add two avx instructions example & simple clear screen

2026-01-31 20:35:15 +03:00 · 2020-12-17 17:49:05 +03:00 · 2020-12-17 17:49:05 +03:00 · 1af9716058
commit 1af9716058
parent c790bcde1f
4 changed files with 123 additions and 0 deletions
--- a/avx/Makefile
+++ b/avx/Makefile
@ -0,0 +1,9 @@
 # Build the AVX examples: every target is compiled from the .c file of
 # the same name.
 CC=gcc
 # -march=native targets the build machine's CPU; the examples need AVX.
 CFLAGS= -march=native -O3
 # dotproduct.c calls sqrt(), so link against libm.
 LDLIBS= -lm
 # "all" names no file; keep a stray file called "all" from masking it.
 .PHONY: all
 all: add dotproduct
 %: %.c
 	@printf '\t\tCC %s\n' "$<"
 	$(CC) $(CFLAGS) -o $@ $< $(LDLIBS)
--- a/avx/add.c
+++ b/avx/add.c
@ -0,0 +1,32 @@
 /* Construct a 256-bit vector from four 64-bit doubles, add it to a
 * running sum one billion times and print the result.
 */
 #include <stdio.h>
 #include <immintrin.h>
 /* Accumulate a constant 4-double vector one billion times with AVX
 * adds and print the four resulting lanes. Always returns 0. */
 int main(void) {
     // Construction from scalars or literals. _mm256_set_pd takes its
     // arguments from the highest lane down, so lane 0 holds 4.0.
     const __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
     // Running sum, starting at zero in all four lanes.
     __m256d sum = _mm256_setzero_pd();
     for (int i = 0; i < 1000000000; ++i) {
         // Add the two vectors, interpreting the bits as 4
         // double-precision floats.
         sum = _mm256_add_pd(a, sum);
     }
     // Do we ever touch DRAM or will these four be registers?
     // _mm256_store_pd needs a 32-byte aligned destination.
     __attribute__ ((aligned (32))) double output[4];
     _mm256_store_pd(output, sum);
     printf("%f %f %f %f\n",
            output[0], output[1], output[2], output[3]);
     return 0;
 }
--- a/avx/dotproduct.c
+++ b/avx/dotproduct.c
@ -0,0 +1,61 @@
 /* Compute the dot product of two (properly aligned) vectors. */
 #include <stdio.h>
 #include <immintrin.h>
 #include <math.h>
 /* Number of elements in each test vector. An enum constant (unlike a
 * const int) is an integer constant expression in C, so arrays sized
 * by it are ordinary fixed-size arrays rather than VLAs. */
 enum { N = 83 };
 /* Scalar reference implementation: accumulates a[i]*b[i] over the
 * first N elements in index order and returns the total. */
 double slow_dot_product(const double *a, const double *b) {
     double total = 0.0;
     int idx = 0;
     while (idx < N) {
         total += a[idx] * b[idx];
         ++idx;
     }
     return total;
 }
 /* Horizontal add works within 128-bit lanes. Use scalar ops to add
 * across the boundary.
 *
 * Returns the sum of the four doubles held in input. */
 double reduce_vector1(__m256d input) {
     /* pair_sums = { in0+in1, in0+in1, in2+in3, in2+in3 } */
     __m256d pair_sums = _mm256_hadd_pd(input, input);
     /* Read element 0 of each 128-bit half with intrinsics instead of
     * casting the vector's address to double*. */
     double low_sum = _mm_cvtsd_f64(_mm256_castpd256_pd128(pair_sums));
     double high_sum = _mm_cvtsd_f64(_mm256_extractf128_pd(pair_sums, 1));
     return low_sum + high_sum;
 }
 /* Another way to get around the 128-bit boundary: grab the upper 128
 * bits, grab the lower 128 bits and then add them together with a 128
 * bit add instruction.
 *
 * Returns the sum of the four doubles held in input. */
 double reduce_vector2(__m256d input) {
     /* pair_sums = { in0+in1, in0+in1, in2+in3, in2+in3 } */
     __m256d pair_sums = _mm256_hadd_pd(input, input);
     __m128d upper = _mm256_extractf128_pd(pair_sums, 1);
     __m128d lower = _mm256_castpd256_pd128(pair_sums);
     __m128d total = _mm_add_pd(upper, lower);
     /* Element 0 of total is the full sum; extract it with an intrinsic
     * instead of casting the vector's address to double*. */
     return _mm_cvtsd_f64(total);
 }
 /* AVX dot product of the first N elements of a and b.
 *
 * Whole groups of four doubles are multiplied and accumulated lane-wise
 * with aligned 256-bit loads; the leftover N%4 elements are handled by
 * a scalar loop. The vector accumulator is collapsed with
 * reduce_vector2() and added to the scalar remainder. */
 double dot_product(const double *a, const double *b) {
     /* Index of the first element not covered by a full 256-bit block. */
     const int vec_end = N - N%4;
     __m256d acc = _mm256_set_pd(0.0, 0.0, 0.0, 0.0);
     for (int k = 0; k < vec_end; k += 4) {
         __m256d lhs = _mm256_load_pd(a + k);
         __m256d rhs = _mm256_load_pd(b + k);
         acc = _mm256_add_pd(acc, _mm256_mul_pd(lhs, rhs));
     }
     /* Scalar tail for the elements past the last full block. */
     double tail = 0.0;
     for (int k = vec_end; k < N; ++k) {
         tail += a[k] * b[k];
     }
     return reduce_vector2(acc) + tail;
 }
 /* Fill two 32-byte aligned arrays with i/sqrt(N), then print the AVX
 * dot product followed by the scalar reference result. */
 int main() {
     __attribute__ ((aligned (32))) double a[N], b[N];
     int k = 0;
     while (k < N) {
         const double value = k/sqrt(N);
         b[k] = value;
         a[k] = value;
         ++k;
     }
     double vector_result = dot_product(a, b);
     printf("%f\n", vector_result);
     printf("%f\n", slow_dot_product(a,b));
     return 0;
 }
--- a/clr.c
+++ b/clr.c
@ -0,0 +1,21 @@
 // make a simple "CLS"
 #include <stdio.h>
 #include <unistd.h>
 /* Print 40 lines to stdout, numbered 0..39, each of the form
 * "String <number> - <add>". */
 void printstrings(const char *add){
     int line = 0;
     while(line < 40){
         printf("String %d - %s\n", line, add);
         ++line;
     }
 }
 /* Labels shown one after another by main(). The table is read-only and
 * used only in this file, so both the strings and the pointers are
 * const and the array has internal linkage. */
 static const char *const x[] = {"first", "second", "third"};
 /* For every label in x: reset the terminal, print 40 lines carrying
 * that label and wait one second. The terminal is reset once more
 * before exiting. Always returns 0. */
 int main(void){
     /* ESC c: the terminal "reset to initial state" sequence, used here
     * as a clear-screen. */
     static const char clear_screen[] = "\033c";
     /* Derive the count from the table so the loop cannot drift out of
     * step with it. */
     const size_t count = sizeof x / sizeof x[0];
     for(size_t i = 0; i < count; ++i){
         fputs(clear_screen, stdout);
         printstrings(x[i]);
         /* Push the output out before sleeping; stdout is only
         * line-buffered when it is attached to a terminal. */
         fflush(stdout);
         sleep(1);
     }
     fputs(clear_screen, stdout);
     return 0;
 }