From 1af971605812251a265a71cc4e6dd1430ea01761 Mon Sep 17 00:00:00 2001
From: Edward Emelianov
Date: Thu, 17 Dec 2020 17:49:05 +0300
Subject: [PATCH] add two avx instructions example & simple clear screen

---
 avx/Makefile     |  9 +++++++
 avx/add.c        | 32 +++++++++++++++++++++++++
 avx/dotproduct.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
 clr.c            | 21 +++++++++++++++++
 4 files changed, 123 insertions(+)
 create mode 100644 avx/Makefile
 create mode 100644 avx/add.c
 create mode 100644 avx/dotproduct.c
 create mode 100644 clr.c

diff --git a/avx/Makefile b/avx/Makefile
new file mode 100644
index 0000000..eebf8e4
--- /dev/null
+++ b/avx/Makefile
@@ -0,0 +1,9 @@
+CC=gcc
+CFLAGS= -march=native -O3
+
+all: add dotproduct
+
+%: %.c
+	@echo -e "\t\tCC $<"
+	$(CC) $(CFLAGS) -o $@ $< -lm
+
diff --git a/avx/add.c b/avx/add.c
new file mode 100644
index 0000000..08c0724
--- /dev/null
+++ b/avx/add.c
@@ -0,0 +1,32 @@
+/* Construct a 256-bit vector from 4 64-bit doubles.  Add it to itself
+ * and print the result.
+ */
+
+#include <immintrin.h>
+#include <stdio.h>
+
+int main() {
+
+  __m256i hello;
+  // Construction from scalars or literals.
+  __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
+
+  // Does GCC generate the correct mov, or (better yet) elide the copy
+  // and pass two of the same register into the add?  Let's look at the assembly.
+  __m256d b = _mm256_set_pd(0.0, 0.0, 0.0, 0.0), c;
+  for(int i = 0; i < 1000000000; ++i){
+    // Add the two vectors, interpreting the bits as 4 double-precision
+    // floats.
+    c = _mm256_add_pd(a, b);
+    b = c;
+
+  }
+
+  // Do we ever touch DRAM or will these four be registers?
+  __attribute__ ((aligned (32))) double output[4];
+  _mm256_store_pd(output, c);
+
+  printf("%f %f %f %f\n",
+         output[0], output[1], output[2], output[3]);
+  return 0;
+}
diff --git a/avx/dotproduct.c b/avx/dotproduct.c
new file mode 100644
index 0000000..dbdee2c
--- /dev/null
+++ b/avx/dotproduct.c
@@ -0,0 +1,61 @@
+/* Compute the dot product of two (properly aligned) vectors. */
+#include <immintrin.h>
+#include <stdio.h>
+#include <math.h>
+
+const int N = 83;
+
+double slow_dot_product(const double *a, const double *b) {
+  double answer = 0.0;
+  for(int ii = 0; ii < N; ++ii)
+    answer += a[ii]*b[ii];
+  return answer;
+}
+
+/* Horizontal add works within 128-bit lanes.  Use scalar ops to add
+ * across the boundary. */
+double reduce_vector1(__m256d input) {
+  __m256d temp = _mm256_hadd_pd(input, input);
+  return ((double*)&temp)[0] + ((double*)&temp)[2];
+}
+
+/* Another way to get around the 128-bit boundary: grab the upper 128
+ * bits, grab the lower 128 bits and then add them together with a 128
+ * bit add instruction. */
+double reduce_vector2(__m256d input) {
+  __m256d temp = _mm256_hadd_pd(input, input);
+  __m128d sum_high = _mm256_extractf128_pd(temp, 1);
+  __m128d result = _mm_add_pd(sum_high, _mm256_castpd256_pd128(temp));
+  return ((double*)&result)[0];
+}
+
+double dot_product(const double *a, const double *b) {
+  __m256d sum_vec = _mm256_set_pd(0.0, 0.0, 0.0, 0.0);
+
+  /* Add up partial dot-products in blocks of 256 bits */
+  for(int ii = 0; ii < N/4; ++ii) {
+    __m256d x = _mm256_load_pd(a+4*ii);
+    __m256d y = _mm256_load_pd(b+4*ii);
+    __m256d z = _mm256_mul_pd(x,y);
+    sum_vec = _mm256_add_pd(sum_vec, z);
+  }
+
+  /* Find the partial dot-product for the remaining elements after
+   * dealing with all 256-bit blocks. */
+  double final = 0.0;
+  for(int ii = N-N%4; ii < N; ++ii)
+    final += a[ii] * b[ii];
+
+  return reduce_vector2(sum_vec) + final;
+}
+
+int main() {
+  __attribute__ ((aligned (32))) double a[N], b[N];
+
+  for(int ii = 0; ii < N; ++ii)
+    a[ii] = b[ii] = ii/sqrt(N);
+
+  double answer = dot_product(a, b);
+  printf("%f\n", answer);
+  printf("%f\n", slow_dot_product(a,b));
+}
diff --git a/clr.c b/clr.c
new file mode 100644
index 0000000..cfc9697
--- /dev/null
+++ b/clr.c
@@ -0,0 +1,21 @@
+// make a simple "CLS"
+
+#include <stdio.h>
+#include <unistd.h>
+
+void printstrings(const char *add){
+    for(int i = 0; i < 40; ++i)
+        printf("String %d - %s\n", i, add);
+}
+
+const char *x[] = {"first", "second", "third"};
+
+int main(){
+    for(int i = 0; i < 3; ++i){
+        printf("\033c");
+        printstrings(x[i]);
+        sleep(1);
+    }
+    printf("\033c");
+    return 0;
+}