33 lines
871 B
C

/* Construct a 256-bit vector from 4 64-bit doubles. Add it into a
* zero-initialized accumulator vector one billion times and print the result.
*/
#include <stdio.h>
#include <immintrin.h>
/*
 * Repeatedly add a constant 4-lane double vector into a running sum and
 * print the four lanes of the final sum.
 *
 * Takes no arguments. Returns 0 after printing one line to stdout.
 */
int main(void) {
    // Construction from scalars or literals. Per the Intel intrinsics
    // reference, _mm256_set_pd takes its arguments from the highest lane
    // down, so the last argument (4.0) ends up in lane 0.
    const __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);

    // Running sum; every lane starts at zero.
    __m256d b = _mm256_set_pd(0.0, 0.0, 0.0, 0.0);

    for (int i = 0; i < 1000000000; ++i) {
        // Add the two vectors, interpreting the bits as 4 double-precision
        // floats, and keep the result as the new running sum.
        b = _mm256_add_pd(a, b);
    }

    // Do we ever touch DRAM or will these four be registers? Let's look at
    // the assembly. The aligned store requires a 32-byte-aligned
    // destination, hence the alignment attribute on the buffer.
    __attribute__ ((aligned (32))) double output[4];
    _mm256_store_pd(output, b);

    printf("%f %f %f %f\n",
           output[0], output[1], output[2], output[3]);
    return 0;
}