mirror of
https://github.com/eddyem/eddys_snippets.git
synced 2025-12-06 02:35:12 +03:00
add two avx instructions example & simple clear screen
This commit is contained in:
parent
c790bcde1f
commit
1af9716058
9
avx/Makefile
Normal file
9
avx/Makefile
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
# Build the AVX example programs; each one is a single C source file.
CC = gcc
CFLAGS = -march=native -O3
# dotproduct.c calls sqrt(), so link the math library.
LDLIBS = -lm

.PHONY: all
all: add dotproduct

# Build an executable directly from the .c file of the same name.
# printf is used instead of "echo -e": make runs recipes with /bin/sh,
# and not every sh implementation understands echo's -e option.
%: %.c
	@printf '\t\tCC %s\n' "$<"
	$(CC) $(CFLAGS) -o $@ $< $(LDLIBS)
|
||||||
|
|
||||||
32
avx/add.c
Normal file
32
avx/add.c
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
/* Construct a 256-bit vector from four 64-bit doubles, add it into an
 * accumulator vector one billion times, and print the result.
 */
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
/* Accumulate the vector a into a running sum one billion times using
 * 256-bit AVX additions, then print the four lanes of the sum.
 * Returns 0. */
int main(void) {
    // Construction from scalars or literals.
    const __m256d a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);

    // Running sum, starting from all zeros.
    __m256d acc = _mm256_setzero_pd();

    // The interesting part is the generated assembly: ideally both
    // vectors stay in registers for the whole loop.
    for (int i = 0; i < 1000000000; ++i) {
        // Add the two vectors, interpreting the bits as 4 double-precision
        // floats.
        acc = _mm256_add_pd(a, acc);
    }

    // Do we ever touch DRAM or will these four be registers?
    // output is 32-byte aligned because _mm256_store_pd is the aligned store.
    __attribute__ ((aligned (32))) double output[4];
    _mm256_store_pd(output, acc);

    printf("%f %f %f %f\n",
           output[0], output[1], output[2], output[3]);
    return 0;
}
|
||||||
61
avx/dotproduct.c
Normal file
61
avx/dotproduct.c
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
/* Compute the dot product of two (properly aligned) vectors. */
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <immintrin.h>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
/* Number of elements in each test vector. Declared as an enumeration
 * constant so that N is a true integer constant expression: arrays sized
 * by it are ordinary fixed-size arrays rather than variable-length arrays. */
enum { N = 83 };
|
||||||
|
|
||||||
|
double slow_dot_product(const double *a, const double *b) {
|
||||||
|
double answer = 0.0;
|
||||||
|
for(int ii = 0; ii < N; ++ii)
|
||||||
|
answer += a[ii]*b[ii];
|
||||||
|
return answer;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Horizontal add works within 128-bit lanes. Use scalar ops to add
 * across the boundary.
 * Returns the sum of the four doubles held in input. */
double reduce_vector1(__m256d input) {
    /* After the horizontal add, element 0 holds the sum of the lower pair
     * and element 2 holds the sum of the upper pair. */
    __m256d pair_sums = _mm256_hadd_pd(input, input);

    /* Pull the two partial sums out with intrinsics instead of casting the
     * vector's address to double*, which relies on compiler-specific
     * aliasing behaviour. */
    double low_sum = _mm_cvtsd_f64(_mm256_castpd256_pd128(pair_sums));
    double high_sum = _mm_cvtsd_f64(_mm256_extractf128_pd(pair_sums, 1));
    return low_sum + high_sum;
}
|
||||||
|
|
||||||
|
/* Another way to get around the 128-bit boundary: extract the upper 128
 * bits, take the lower 128 bits, and add the two halves together with a
 * 128-bit add instruction.
 * Returns the sum of the four doubles held in input. */
double reduce_vector2(__m256d input) {
    /* Sum adjacent elements inside each 128-bit half. */
    __m256d pair_sums = _mm256_hadd_pd(input, input);

    __m128d high = _mm256_extractf128_pd(pair_sums, 1);
    __m128d low = _mm256_castpd256_pd128(pair_sums);
    __m128d total = _mm_add_pd(high, low);

    /* Read element 0 with the dedicated intrinsic rather than casting the
     * vector's address to double*, which relies on compiler-specific
     * aliasing behaviour. */
    return _mm_cvtsd_f64(total);
}
|
||||||
|
|
||||||
|
double dot_product(const double *a, const double *b) {
|
||||||
|
__m256d sum_vec = _mm256_set_pd(0.0, 0.0, 0.0, 0.0);
|
||||||
|
|
||||||
|
/* Add up partial dot-products in blocks of 256 bits */
|
||||||
|
for(int ii = 0; ii < N/4; ++ii) {
|
||||||
|
__m256d x = _mm256_load_pd(a+4*ii);
|
||||||
|
__m256d y = _mm256_load_pd(b+4*ii);
|
||||||
|
__m256d z = _mm256_mul_pd(x,y);
|
||||||
|
sum_vec = _mm256_add_pd(sum_vec, z);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Find the partial dot-product for the remaining elements after
|
||||||
|
* dealing with all 256-bit blocks. */
|
||||||
|
double final = 0.0;
|
||||||
|
for(int ii = N-N%4; ii < N; ++ii)
|
||||||
|
final += a[ii] * b[ii];
|
||||||
|
|
||||||
|
return reduce_vector2(sum_vec) + final;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
__attribute__ ((aligned (32))) double a[N], b[N];
|
||||||
|
|
||||||
|
for(int ii = 0; ii < N; ++ii)
|
||||||
|
a[ii] = b[ii] = ii/sqrt(N);
|
||||||
|
|
||||||
|
double answer = dot_product(a, b);
|
||||||
|
printf("%f\n", answer);
|
||||||
|
printf("%f\n", slow_dot_product(a,b));
|
||||||
|
}
|
||||||
21
clr.c
Normal file
21
clr.c
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
// make a simple "CLS"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
// Print 40 numbered lines ("String 0" .. "String 39"), each followed by
// " - " and the text in add.
void printstrings(const char *add){
    int line = 0;
    while(line < 40){
        printf("String %d - %s\n", line, add);
        ++line;
    }
}
|
||||||
|
|
||||||
|
// Suffixes passed to printstrings(), one per screenful shown by main().
const char *x[] = {"first", "second", "third"};
|
||||||
|
|
||||||
|
int main(){
|
||||||
|
for(int i = 0; i < 3; ++i){
|
||||||
|
printf("\033c");
|
||||||
|
printstrings(x[i]);
|
||||||
|
sleep(1);
|
||||||
|
}
|
||||||
|
printf("\033c");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user