From 1af971605812251a265a71cc4e6dd1430ea01761 Mon Sep 17 00:00:00 2001
From: Edward Emelianov <eddy@sao.ru>
Date: Thu, 17 Dec 2020 17:49:05 +0300
Subject: [PATCH] add two avx instructions example & simple clear screen

---
 avx/Makefile     |  9 +++++++
 avx/add.c        | 32 +++++++++++++++++++++++++
 avx/dotproduct.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
 clr.c            | 21 +++++++++++++++++
 4 files changed, 123 insertions(+)
 create mode 100644 avx/Makefile
 create mode 100644 avx/add.c
 create mode 100644 avx/dotproduct.c
 create mode 100644 clr.c

diff --git a/avx/Makefile b/avx/Makefile
new file mode 100644
index 0000000..eebf8e4
--- /dev/null
+++ b/avx/Makefile
@@ -0,0 +1,9 @@
+# Build the AVX examples. LDLIBS links libm because dotproduct.c calls sqrt().
+CC=gcc
+CFLAGS= -march=native -O3
+LDLIBS= -lm
+.PHONY: all
+all: add dotproduct
+%: %.c
+	@printf '\t\tCC %s\n' "$<"
+	$(CC) $(CFLAGS) -o $@ $< $(LDLIBS)
diff --git a/avx/add.c b/avx/add.c
new file mode 100644
index 0000000..08c0724
--- /dev/null
+++ b/avx/add.c
@@ -0,0 +1,32 @@
+/* Construct a 256-bit vector from 4 64-bit doubles, add it to a zeroed
+ * accumulator 1000000000 times, and print the resulting four sums.
+ */
+
+#include <stdio.h>
+#include <immintrin.h>
+
+int main(void) {
+  // Increment vector built from scalar literals. _mm256_set_pd lists lanes
+  // from highest to lowest, so memory order is {4.0, 3.0, 2.0, 1.0}.
+  __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
+
+  // Accumulator b starts at zero; c receives each sum and is initialised
+  // too, so the store below never reads an indeterminate value.
+  __m256d b = _mm256_set_pd(0.0, 0.0, 0.0, 0.0);
+  __m256d c = b;
+
+  // Does GCC generate the correct mov, or (better yet) elide the copy
+  // and pass two of the same register into the add? Let's look at the assembly.
+  for (int i = 0; i < 1000000000; ++i) {
+    // Add the two vectors, interpreting the bits as 4 double-precision
+    // floats.
+    c = _mm256_add_pd(a, b);
+    b = c;
+  }
+
+  // Do we ever touch DRAM or will these four be registers?
+  __attribute__ ((aligned (32))) double output[4];
+  _mm256_store_pd(output, c);
+  printf("%f %f %f %f\n", output[0], output[1], output[2], output[3]);
+  return 0;
+}
diff --git a/avx/dotproduct.c b/avx/dotproduct.c
new file mode 100644
index 0000000..dbdee2c
--- /dev/null
+++ b/avx/dotproduct.c
@@ -0,0 +1,61 @@
+/* Compute the dot product of two (properly aligned) vectors. */
+#include <stdio.h>
+#include <immintrin.h>
+#include <math.h>
+
+enum { N = 83 };  /* vector length as a true constant expression, so a[N] is not a VLA */
+
+double slow_dot_product(const double *a, const double *b) {
+    double total = 0.0;  /* scalar reference: sum of a[k]*b[k], k in [0, N) */
+    for (const double *stop = a + N; a != stop; ++a, ++b)
+        total += *a * *b;
+    return total;
+}
+
+/* hadd sums pairs within each 128-bit lane; add the two lane totals as scalars. */
+double reduce_vector1(__m256d input) {
+    __m256d temp = _mm256_hadd_pd(input, input);
+    return _mm_cvtsd_f64(_mm256_castpd256_pd128(temp))      /* low lane total */
+         + _mm_cvtsd_f64(_mm256_extractf128_pd(temp, 1));   /* high lane total */
+}
+
+/* Another way to get around the 128-bit boundary: grab the upper 128
+ * bits, grab the lower 128 bits and then add them together with a 128
+ * bit add instruction. */
+double reduce_vector2(__m256d input) {
+    __m256d temp = _mm256_hadd_pd(input, input);
+    __m128d sum_high = _mm256_extractf128_pd(temp, 1);
+    __m128d result = _mm_add_pd(sum_high, _mm256_castpd256_pd128(temp));
+    return _mm_cvtsd_f64(result);  /* element 0 holds the full sum */
+}
+
+/* AVX dot product of the first N elements of a and b. Both pointers must be
+ * 32-byte aligned because the blocks are read with _mm256_load_pd. */
+double dot_product(const double *a, const double *b) {
+    const int blocked = N - N % 4;   /* elements covered by whole 4-wide blocks */
+    __m256d acc = _mm256_set_pd(0.0, 0.0, 0.0, 0.0);
+
+    /* Multiply four pairs at a time and accumulate lane-wise. */
+    for (int k = 0; k < blocked; k += 4) {
+        __m256d prod = _mm256_mul_pd(_mm256_load_pd(a + k), _mm256_load_pd(b + k));
+        acc = _mm256_add_pd(acc, prod);
+    }
+
+    /* Leftover elements that do not fill a whole block are handled in scalar code. */
+    double tail = 0.0;
+    for (int k = blocked; k < N; ++k)
+        tail += a[k] * b[k];
+
+    return reduce_vector2(acc) + tail;
+}
+
+int main() {
+    /* 32-byte alignment is required by the aligned loads in dot_product(). */
+    __attribute__ ((aligned (32))) double a[N], b[N];
+    const double scale = sqrt(N);
+    for (int k = 0; k < N; ++k) {
+        a[k] = b[k] = k / scale;
+    }
+    printf("%f\n", dot_product(a, b));        /* AVX result */
+    printf("%f\n", slow_dot_product(a, b));   /* scalar reference */
+}
diff --git a/clr.c b/clr.c
new file mode 100644
index 0000000..cfc9697
--- /dev/null
+++ b/clr.c
@@ -0,0 +1,21 @@
+// make a simple "CLS"
+
+#include <stdio.h>
+#include <unistd.h>
+
+void printstrings(const char *add){
+    /* Emit 40 numbered lines, each ending with the caller's text. */
+    for(int line = 0; line != 40; line++) printf("String %d - %s\n", line, add);
+}
+
+const char *x[] = {"first", "second", "third"}; // labels main() passes to printstrings(), one per screen
+
+int main(void){
+    for(size_t i = 0; i < sizeof x / sizeof x[0]; ++i){ // one screen per entry of x
+        printf("\033c");   // ESC c clears the terminal (the "CLS")
+        printstrings(x[i]);
+        sleep(1);          // leave each screen visible for one second
+    }
+    printf("\033c");       // final clear before exiting
+    return 0;
+}