Commit 4626a85d authored by Martino

Improved intrinsic version: it is now only about 20 µs slower than the normal (non-intrinsic) one.

parent a3d5d0f8
@@ -19,53 +19,41 @@ void nr_polar_kernal_operation(uint8_t *u, uint8_t *d, uint16_t N)
             d[i]=d[i] ^ (!( (j-i)& i ))*u[j];
         }
     }
 /*
  * It works, but there are too many moves from memory and it's slow. With AVX-512 it could be done faster
  *
-    __m256i A,B,C,E, OUT;
+    __m256i A,B,C,E,U, OUT;
+    __m256i inc;
     uint32_t dTest[8];
-    uint32_t jiArray[8];
-    uint32_t iArray[8];
     uint32_t uArray[8];
     uint32_t k;
     uint32_t toCheck[8];
+    uint32_t incArray[8];
+    for(k=0; k<8; k++)
+        incArray[k]=k; //0,1, ... 7
+    inc=_mm256_loadu_si256((__m256i const*)incArray);
     for(i=0; i<N; i+=8)
     {
-        iArray[0]=i;
-        iArray[1]=i+1;
-        iArray[2]=i+2;
-        iArray[3]=i+3;
-        iArray[4]=i+4;
-        iArray[5]=i+5;
-        iArray[6]=i+6;
-        iArray[7]=i+7;
+        B=_mm256_set1_epi32((int)i); // i, ... i
+        B=_mm256_add_epi32(B, inc);  //i, i+1, ... i+7
         OUT=_mm256_setzero_si256();
         for(j=0; j<N; j++)
         {
             //initialisation
-            jiArray[0]=j-i;
-            jiArray[1]=j-(i+1);
-            jiArray[2]=j-(i+2);
-            jiArray[3]=j-(i+3);
-            jiArray[4]=j-(i+4);
-            jiArray[5]=j-(i+5);
-            jiArray[6]=j-(i+6);
-            jiArray[7]=j-(i+7);
-            uArray[0]=(uint32_t)u[j];
-            uArray[1]=(uint32_t)u[j];
-            uArray[2]=(uint32_t)u[j];
-            uArray[3]=(uint32_t)u[j];
-            uArray[4]=(uint32_t)u[j];
-            uArray[5]=(uint32_t)u[j];
-            uArray[6]=(uint32_t)u[j];
-            uArray[7]=(uint32_t)u[j];
-            A=_mm256_loadu_si256((__m256i const*)jiArray);
-            B=_mm256_loadu_si256((__m256i const*)iArray);
+            A=_mm256_set1_epi32((int)(j)); //j ...
+            A=_mm256_sub_epi32(A, B);      //(j-i), (j-(i+1)), ... (j-(i+7))
+            U=_mm256_set1_epi32((int)u[j]);
+            _mm256_storeu_si256((__m256i*)uArray, U); //u(j) ... u(j) for the maskload
             C=_mm256_and_si256(A, B); //mask: if zero, then add
             _mm256_storeu_si256((__m256i*)toCheck, C);
@@ -79,7 +67,7 @@ void nr_polar_kernal_operation(uint8_t *u, uint8_t *d, uint16_t N)
             OUT=_mm256_xor_si256(OUT, E); //32 bit x 8
         }
-        _mm256_storeu_si256((__m256i*)&dTest, OUT);
+        _mm256_storeu_si256((__m256i*)dTest, OUT);
         for(k=0; k<8; k++)
         {
...
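For context, the scalar loop shown at the top of the first hunk XORs u[j] into d[i] exactly when ((j-i) & i) == 0; that is the operation both the AVX2 draft and the commit vectorise. A minimal self-contained sketch follows; the diff does not show the loop-variable types or how d is initialised, so the signed 32-bit indices and the zeroing of d are assumptions here, and the _scalar suffix is a hypothetical name.

#include <stdint.h>

/* Scalar reference for the kernel operation in the hunk header:
 * d[i] accumulates (via XOR) every u[j] with ((j-i) & i) == 0. */
void nr_polar_kernal_operation_scalar(const uint8_t *u, uint8_t *d, uint16_t N)
{
    for (int32_t i = 0; i < N; i++) {
        d[i] = 0;                                 /* assumption: d starts at 0 */
        for (int32_t j = 0; j < N; j++)
            d[i] = d[i] ^ (!((j - i) & i)) * u[j];
    }
}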
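The core of the speed-up is visible in the first hunk: the old code wrote i, i+1, ..., i+7 (and the j-i differences, and eight copies of u[j]) to stack arrays and reloaded them with _mm256_loadu_si256 every iteration; the new code builds the same vectors with broadcasts and adds, keeping them in registers. A sketch of the idiom in isolation, with lanes_from as a hypothetical helper name:

#include <immintrin.h>
#include <stdint.h>

/* Build {i, i+1, ..., i+7} without touching memory: broadcast i and
 * add a constant 0..7 increment vector. The commit fills incArray in
 * a loop and loads it once before the outer loop; _mm256_setr_epi32
 * expresses the same constant directly. */
static inline __m256i lanes_from(uint32_t i)
{
    const __m256i inc = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    return _mm256_add_epi32(_mm256_set1_epi32((int)i), inc);
}

Inside the j loop the same trick yields (j-i), (j-(i+1)), ..., (j-(i+7)) as _mm256_sub_epi32(_mm256_set1_epi32(j), B), which is what removes most of the per-iteration memory traffic the comment complains about.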
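The comment's AVX-512 remark is plausible: with AVX512F, the test-and-accumulate step collapses into one masked XOR per iteration, with no stores to toCheck at all. The following is an untested sketch, not the committed code; it assumes u holds 0/1 values, N is a multiple of 16, and it replaces the AVX2 compare/multiply path with _mm512_testn_epi32_mask plus _mm512_mask_xor_epi32.

#include <immintrin.h>
#include <stdint.h>

/* Hypothetical AVX-512 variant of the same kernel operation.
 * _mm512_testn_epi32_mask(A, B) sets a mask bit per lane where
 * (A & B) == 0, i.e. where ((j-i) & i) == 0, and
 * _mm512_mask_xor_epi32 XORs u[j] only into those lanes. */
void nr_polar_kernal_operation_avx512(const uint8_t *u, uint8_t *d, uint16_t N)
{
    const __m512i inc = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
                                          8, 9, 10, 11, 12, 13, 14, 15);
    for (uint32_t i = 0; i < N; i += 16) {
        __m512i B = _mm512_add_epi32(_mm512_set1_epi32((int)i), inc); // i .. i+15
        __m512i OUT = _mm512_setzero_si512();
        for (uint32_t j = 0; j < N; j++) {
            __m512i A = _mm512_sub_epi32(_mm512_set1_epi32((int)j), B); // j-i ..
            __mmask16 m = _mm512_testn_epi32_mask(A, B); // lanes with ((j-i)&i)==0
            __m512i U = _mm512_set1_epi32((int)u[j]);
            OUT = _mm512_mask_xor_epi32(OUT, m, OUT, U); // masked accumulate
        }
        uint32_t tmp[16];                        /* like dTest in the AVX2 draft */
        _mm512_storeu_si512((__m512i *)tmp, OUT);
        for (uint32_t k = 0; k < 16; k++)
            d[i + k] = (uint8_t)(tmp[k] & 1);    /* assumes u[j] is 0 or 1 */
    }
}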