New encoding implementation (with an AVX-512 version commented out below)

69767e7d · martino · a77e31b5 · 69767e7d · 69767e7d
Commit 69767e7d authored Jun 06, 2018 by martino
2 changed files
--- a/openair1/PHY/CODING/nrPolar_tools/nr_polar_encoder.c
+++ b/openair1/PHY/CODING/nrPolar_tools/nr_polar_encoder.c
@@ -53,7 +53,12 @@ void polar_encoder(
 			polarParams->Q_I_N, polarParams->Q_PC_N, polarParams->n_pc);

 	//Encoding (u to d)
-	nr_matrix_multiplication_uint8_t_1D_uint8_t_2D(polarParams->nr_polar_u, polarParams->G_N, polarParams->nr_polar_d, polarParams->N, polarParams->N);
+	// --- OLD ---
+	//nr_matrix_multiplication_uint8_t_1D_uint8_t_2D(polarParams->nr_polar_u, polarParams->G_N, polarParams->nr_polar_d, polarParams->N, polarParams->N);
+	//for (uint16_t i = 0; i < polarParams->N; i++) polarParams->nr_polar_d[i] = (polarParams->nr_polar_d[i] % 2);
+	// --- NEW ---
+	nr_polar_kernal_operation(polarParams->nr_polar_u, polarParams->nr_polar_d, polarParams->N);
+
 	for (uint16_t i = 0; i < polarParams->N; i++) polarParams->nr_polar_d[i] = (polarParams->nr_polar_d[i] % 2);

 	//Rate matching

--- a/openair1/PHY/CODING/nrPolar_tools/nr_polar_kernal_operation.c
+++ b/openair1/PHY/CODING/nrPolar_tools/nr_polar_kernal_operation.c
@@ -3,21 +3,81 @@
 #include <math.h>
 #include <stdint.h>

-void nr_polar_kernel_operation(uint8_t *u, uint8_t *d, uint16_t N)
+#include <immintrin.h>
+
+void nr_polar_kernal_operation(uint8_t *u, uint8_t *d, uint16_t N)
 {
-    // Martino's algorithm to avoid multiplication for the generating matrix
-
-    int i,j;
-    printf("\nd = ");
-	for(i=0; i<N; i++)
-    {
-        d[i]=0;
-        for(j=0; j<N; j++)
+	// Martino's algorithm to avoid multiplication for the generating matrix of polar codes
+	
+	uint16_t i,j;
+ 
+	for(i=0; i<N; i++) // Create the elements of d=u*G_N ...
+    	{
+        	d[i]=0;
+        	for(j=0; j<N; j++) // ... looking at all the elements of u
+        	{
+            		d[i]=d[i]+( (!(j-i)) | (!i) )*u[j];
+        	}
+
+		d[i]=d[i]%2; // modulo 2
+    	}
+
+/*
+	__m256i maddReg, uReg, orReg;
+	__m512i maddRegConv;
+	__m256i bitJIReg, bitIReg;
+	uint8_t bitJI[32];
+	uint8_t bitI[32];
+	int sumPartial;
+	uint8_t indToInit;
+
+        for(i=0; i<N; i++) // Create the elements of d=u*G_N ...
        {
-            d[i]=d[i]+(( (j-i)& i )==0)*u[j];
+                d[i]=0;
+                for(j=0; j<N; j+=32) // ... looking at all the elements of u 32 at a time
+                {
+                        //d[i]=d[i]+( (!(j-i)) | (!i) )*u[j];  <--- THIS IN INTRINSIC
+			// Products between ( (!(j-i)) | (!i) ) and u[j] and sum all with a reduce add
+
+			uReg = _mm256_maskz_loadu_epi8 (0xFFFFFFFF, (void const*)&u[j]); // load 32 8-bit from u
+			
+			//init arrays for (!(i-j)) and for (!i)
+			for(indToInit=0; indToInit<32; indToInit++)
+			{
+				// j = j*32+indToInit
+				bitJI[j*32+indToInit] = !((j*32+indToInit)-i); // (!(j-i))
+				bitI[j*32+indToInit] = !i; // (!i)
+			}
+	 	
+			bitJIReg = _mm256_maskz_loadu_epi8(0xFFFFFFFF, (void const*)bitJI); // 32x8-bit
+			bitIReg = _mm256_maskz_loadu_epi8(0xFFFFFFFF, (void const*)bitI);   // 32x8-bit
+			orReg=_mm256_or_si256(bitWise1, bitWise2); // (!(j-i)) | (!i)   32x8-bit
+			maddReg=_mm256_maddubs_epi16(uReg, orReg); //a1*b1+a2*b2 from 32x8 to 16x16-bit
+			maddRegConv= _mm512_cvtepi16_epi32(maddReg); //convert to 16x32-bit
+			sumPartial = _mm512_reduce_add_epi32(maddRegConv); //sum all 16 values
+
+			d[i] = d[i] + sumPartial; //store in the final variable
+                }
+
+                d[i]=d[i]%2; // modulo 2
        }
-        d[i]=d[i]%2;
+*/

-        printf("%i", d[i]);
-    }
+/*
+ __m128 num1, num2, num3, num4;
+
+        for (uint16_t i = 0; i < col; i++) {
+        num4=_mm_setzero_ps(); //sets sum to zero
+                for (uint16_t j = 0; j < row; j+=4) {
+                        //output[i] += matrix1[j] * matrix2[j][i];
+                        num1=_mm_load_ps((float*)&matrix1[j]); // 1[3], 1[2], 1[1], 1[0] -> num1
+                        num2=_mm_load_ps((float*)&matrix2[j][i]); // 2[3], 2[2], 2[1], 2[0] -> num2
+                        num3=_mm_mul_ps(num1, num2); // 1[3]*2[3],...1[0]*2[0] -> num3
+                        num3=_mm_hadd_ps(num3, num3); //1[3]*2[3]+1[2]*2[2] ... 
+                        num4 = _mm_add_ps(num4, num3);
+                }
+                num4= _mm_hadd_ps(num4, num4);
+                _mm_store_ss(&output[i], num4); // Stores only the lower SP FP that contain the sum
+        }
+*/
 }