/*
 * Construct a 256-bit vector from four 64-bit doubles, repeatedly add it
 * into an accumulator, and print the result.
 *
 * This is a small experiment for inspecting the assembly GCC generates for
 * AVX intrinsics. It requires AVX support (e.g. compile with -mavx).
 */
#include <immintrin.h>
#include <stdio.h>

/* Number of vector additions performed by the accumulation loop. */
enum { ITERATIONS = 1000000000 };

/*
 * Accumulates the vector (1.0, 2.0, 3.0, 4.0) ITERATIONS times using AVX
 * double-precision adds and prints the four resulting lanes.
 *
 * Returns 0.
 */
int main(void)
{
    /*
     * Construction from scalars or literals. _mm256_set_pd takes its
     * arguments from the highest lane down to the lowest, so the 4.0 ends
     * up in lane 0 (and therefore in output[0] below).
     */
    __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);

    /*
     * b is the running sum; c holds the result of each add. c starts equal
     * to b so that it is well-defined even if ITERATIONS is changed to 0.
     *
     * Question for the assembly: does GCC emit a register move for the
     * `b = c` copy in the loop, or (better yet) elide it entirely?
     */
    __m256d b = _mm256_setzero_pd();
    __m256d c = b;

    for (int i = 0; i < ITERATIONS; ++i) {
        /*
         * Add the two vectors, interpreting the bits as 4 double-precision
         * floats.
         */
        c = _mm256_add_pd(a, b);
        b = c;
    }

    /*
     * Do we ever touch DRAM or will these four be registers?
     * _mm256_store_pd is the aligned store, so the destination must be
     * 32-byte aligned.
     */
    __attribute__ ((aligned (32))) double output[4];
    _mm256_store_pd(output, c);

    printf("%f %f %f %f\n", output[0], output[1], output[2], output[3]);
    return 0;
}