Final version of the polar kernel function (nr_polar_kernal_operation); using the intrinsics now saves 320 µs

d4fbc3a1 · Martino · 4626a85d · d4fbc3a1
Commit d4fbc3a1 authored Jun 18, 2018 by Martino
Show whitespace changes
Inline Side-by-side

Showing with 33 additions and 37 deletions

openair1/PHY/CODING/nrPolar_tools/nr_polar_kernal_operation.c ...air1/PHY/CODING/nrPolar_tools/nr_polar_kernal_operation.c +33 -37

No files found.
--- a/openair1/PHY/CODING/nrPolar_tools/nr_polar_kernal_operation.c
+++ b/openair1/PHY/CODING/nrPolar_tools/nr_polar_kernal_operation.c
@@ -11,69 +11,65 @@ void nr_polar_kernal_operation(uint8_t *u, uint8_t *d, uint16_t N)
 	uint32_t i,j;
-	for(i=0; i<N; i++) // Create the elements of d=u*G_N ...
+	#ifdef __AVX2__
-    	{
-        	d[i]=0;
-        	for(j=0; j<N; j++) // ... looking at all the elements of u
-        	{
-			d[i]=d[i] ^ (!( (j-i)& i ))*u[j];
-        	}
-    	}
+	__m256i A,B,C,D,E,U,zerosOnly, OUT;
-/*
- * It works, but there are too many moves from memory and it's slow. With AVX-512 it could be done faster
- *
-	__m256i A,B,C,E,U, OUT;
 	__m256i inc;
 	uint32_t dTest[8];
 	uint32_t uArray[8];
 	uint32_t k;	
-	uint32_t toCheck[8];
 	uint32_t incArray[8];
+	//initialisation
 	for(k=0; k<8; k++)
-		incArray[k]=k; //0,1, ... 7
+		incArray[k]=k;
+	inc=_mm256_loadu_si256((__m256i const*)incArray); // 0, 1, ..., 7 to increase
-	inc=_mm256_loadu_si256((__m256i const*)incArray);
+	zerosOnly=_mm256_setzero_si256(); // for comparison
 	for(i=0; i<N; i+=8)
        {
+		B=_mm256_set1_epi32((int)i); // i, ..., i
+		B=_mm256_add_epi32(B, inc); // i, i+1, ..., i+7
-		B=_mm256_set1_epi32((int)i); // i, ... i
+		OUT=_mm256_setzero_si256(); // it will contain the result of all the XORs for the d(i)s
-		B=_mm256_add_epi32(B, inc); //i, i+1, ... i+7
-		OUT=_mm256_setzero_si256();
 		for(j=0; j<N; j++)
 		{
-			//initialisation
+			A=_mm256_set1_epi32((int)(j)); //j, j,  ..., j
-			A=_mm256_set1_epi32((int)(j)); //j ...
 			A=_mm256_sub_epi32(A, B); //(j-i), (j-(i+1)), ... (j-(i+7))  
 			U=_mm256_set1_epi32((int)u[j]);
 			_mm256_storeu_si256((__m256i*)uArray, U); //u(j) ... u(j) for the maskload
-			C=_mm256_and_si256(A, B); //mask: if zero, then add
+			C=_mm256_and_si256(A, B); //(j-i)&i -> If zero, then XOR with the u(j)
+			D=_mm256_cmpeq_epi32(C, zerosOnly); // compare with zero and use the result as mask
-			_mm256_storeu_si256((__m256i*)toCheck, C);
+			E=_mm256_maskload_epi32((int const*)uArray, D); // load only some u(j)s for the XOR
-			for(k=0; k<8; k++)
-                        {
-				toCheck[k]=!toCheck[k] << 31;
-			}
-			C=_mm256_loadu_si256((__m256i const*)toCheck); //mask: if 1, add
-			E=_mm256_maskload_epi32((int const*)uArray, C);
 			OUT=_mm256_xor_si256(OUT, E); //32 bit x 8
 		}
 		_mm256_storeu_si256((__m256i*)dTest, OUT);
-		for(k=0; k<8; k++)
+		for(k=0; k<8; k++) // Conversion from 32 bits to 8 bits
                {	
-		        d[i+k]=(uint8_t)dTest[k]; //Conv from 32 to 8
+		        d[i+k]=(uint8_t)dTest[k]; // With AVX512 there is an intrinsic to do it
                }
 	}
-*/
+	#else
+        for(i=0; i<N; i++) // Create the elements of d=u*G_N ...
+        {
+                d[i]=0;
+                for(j=0; j<N; j++) // ... looking at all the elements of u
+                {
+                        d[i]=d[i] ^ (!( (j-i)& i ))*u[j];
+                        // it's like ((j-i)&i)==0
+                }
+        }
+	#endif
 }