mirror of
https://github.com/eddyem/eddys_snippets.git
synced 2025-12-06 10:45:12 +03:00
33 lines
871 B
C
33 lines
871 B
C
/* Construct a 256-bit vector from 4 64-bit doubles. Add it to a
 * zero-initialised accumulator vector one billion times and print the
 * accumulated result.
 */
|
|
|
|
#include <stdio.h>
|
|
#include <immintrin.h>
|
|
|
|
/*
 * AVX demo: repeatedly add a constant 4 x double vector to a running sum and
 * print the four resulting lanes.
 *
 * Requires a CPU and compiler flags with AVX support (e.g. -mavx on GCC/Clang).
 * Returns 0 unconditionally.
 */
int main() {
    // Number of vector additions performed; fits comfortably in an int.
    enum { ITERATIONS = 1000000000 };

    // Construction from scalars or literals. _mm256_set_pd takes its
    // arguments from the highest element down to the lowest, so 4.0 ends up
    // in element 0 and 1.0 in element 3.
    const __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);

    // Running sum, starting from zero in every lane.
    __m256d b = _mm256_set_pd(0.0, 0.0, 0.0, 0.0);

    // Does the compiler keep both operands in registers for the whole loop?
    // Let's look at the assembly.
    for (int i = 0; i < ITERATIONS; ++i) {
        // Add the two vectors, interpreting the bits as 4 double-precision
        // floats, and accumulate straight into the running sum.
        b = _mm256_add_pd(a, b);
    }

    // Do we ever touch DRAM or will these four be registers?
    // _mm256_store_pd requires a 32-byte aligned destination, hence the
    // explicit alignment on the output buffer.
    __attribute__ ((aligned (32))) double output[4];
    _mm256_store_pd(output, b);

    printf("%f %f %f %f\n",
           output[0], output[1], output[2], output[3]);
    return 0;
}
|