AVX512F intrinsics added for shortBlockCoding

41a262c2 · yilmazt · d7e537fa · 41a262c2 · 41a262c2
Commit 41a262c2 authored Mar 12, 2019 by yilmazt
2 changed files
--- a/openair1/PHY/CODING/nrPolar_tools/nr_polar_decoding_tools.c
+++ b/openair1/PHY/CODING/nrPolar_tools/nr_polar_decoding_tools.c
@@ -362,7 +362,7 @@ void applyFtoleft(t_nrPolar_params *pp, decoder_node_t *node) {
    else
 #endif
-    { // equvalent scalar code to above, activated only on non x86/ARM architectures
+    { // equivalent scalar code to above, activated only on non x86/ARM architectures
      for (int i=0;i<node->Nv/2;i++) {
    	  a=alpha_v[i];
    	  b=alpha_v[i+(node->Nv/2)];
@@ -442,7 +442,7 @@ void applyGtoright(t_nrPolar_params *pp,decoder_node_t *node) {
    }
    else 
 #endif
-      {// equvalent scalar code to above, activated only on non x86/ARM architectures or Nv=1,2
+      {// equivalent scalar code to above, activated only on non x86/ARM architectures or Nv=1,2
 	for (int i=0;i<node->Nv/2;i++) {
 	  alpha_r[i] = alpha_v[i+(node->Nv/2)] - (betal[i]*alpha_v[i]);
 	}

--- a/openair1/PHY/CODING/nrSmallBlock/decodeSmallBlock.c
+++ b/openair1/PHY/CODING/nrSmallBlock/decodeSmallBlock.c
@@ -36,6 +36,9 @@
 //#define DEBUG_DECODESMALLBLOCK
+//input = [d̂_0] [d̂_1] [d̂_2] ... [d̂_31]
+//output = [? ... ? ĉ_K-1 ... ĉ_2 ĉ_1 ĉ_0]
 uint16_t decodeSmallBlock(int8_t *in, uint8_t len){
 	uint16_t out = 0;
@@ -80,23 +83,58 @@ uint16_t decodeSmallBlock(int8_t *in, uint8_t len){
 #endif
 	} else {
+		uint8_t maxRow = 0, maxCol = 0;
+#if defined(__AVX2__)
+        int16_t maxVal = 0;
+		int DmatrixElementVal = 0;
+		int8_t DmatrixElement[NR_SMALL_BLOCK_CODED_BITS] = {0};
+		__m256i _in_256 = _mm256_loadu_si256 ((__m256i*)&in[0]);
+		__m256i _maskD_256, _Dmatrixj_256, _maskH_256, _DmatrixElement_256;
+		for (int j = 0; j < ( 1<<(len-6) ); ++j) {
+			_maskD_256 = _mm256_loadu_si256 ((__m256i*)(&maskD[j][0]));
+			_Dmatrixj_256 = _mm256_sign_epi8 (_in_256, _maskD_256);
+			for (int k = 0; k < NR_SMALL_BLOCK_CODED_BITS; ++k) {
+				_maskH_256 = _mm256_loadu_si256 ((__m256i*)(&hadamard32InterleavedTransposed[k][0]));
+				_DmatrixElement_256 = _mm256_sign_epi8 (_Dmatrixj_256, _maskH_256);
+#if defined(__AVX512F__)
+			    DmatrixElementVal = _mm512_reduce_add_epi32 (
+			    		            _mm512_add_epi32(
+			    				    _mm512_cvtepi8_epi32 (_mm256_extracti128_si256 (_DmatrixElement_256, 1)),
+								    _mm512_cvtepi8_epi32 (_mm256_castsi256_si128 (_DmatrixElement_256))
+			    		            				)
+															);
+#else
+				_mm256_storeu_si256((__m256i*)(&DmatrixElement[0]), _DmatrixElement_256);
+				for (int i = 0; i < NR_SMALL_BLOCK_CODED_BITS; ++i)
+					DmatrixElementVal += DmatrixElement[i];
+#endif
+				if (abs(DmatrixElementVal) > abs(maxVal)){
+					maxVal = DmatrixElementVal;
+					maxRow = j;
+					maxCol = k;
+				}
+				DmatrixElementVal=0;
+			}
+		}
+		out = properOrderedBasisExtended[maxRow] | properOrderedBasis[maxCol] | ( (maxVal > 0) ? (uint16_t)0 : (uint16_t)1 );
+#else
 		int8_t Dmatrix[NR_SMALL_BLOCK_CODED_BITS][NR_SMALL_BLOCK_CODED_BITS] = {0};
 		int16_t DmatrixFHT[NR_SMALL_BLOCK_CODED_BITS][NR_SMALL_BLOCK_CODED_BITS] = {0};
 		uint16_t maxVal;
-		uint8_t maxRow = 0, maxCol = 0;
+		uint8_t rowLimit = 1<<(len-6);
-		for (int j = 0; j < NR_SMALL_BLOCK_CODED_BITS; ++j)
+		for (int j = 0; j < ( rowLimit ); ++j)
 			for (int k = 0; k < NR_SMALL_BLOCK_CODED_BITS; ++k)
 				Dmatrix[j][k] = in[k] * maskD[j][k];
+		for (int i = 0; i < ( rowLimit ); ++i)
-		for (int i = 0; i < NR_SMALL_BLOCK_CODED_BITS; ++i)
 			for (int j = 0; j < NR_SMALL_BLOCK_CODED_BITS; ++j)
 				for (int k = 0; k < NR_SMALL_BLOCK_CODED_BITS; ++k)
 					DmatrixFHT[i][j] += Dmatrix[i][k] * hadamard32InterleavedTransposed[j][k];
 		maxVal = abs(DmatrixFHT[0][0]);
-		for (int i = 0; i < NR_SMALL_BLOCK_CODED_BITS; ++i)
+		for (int i = 0; i < ( rowLimit ); ++i)
 			for (int j = 0; j < NR_SMALL_BLOCK_CODED_BITS; ++j)
 				if (abs(DmatrixFHT[i][j]) > maxVal){
 					maxVal = abs(DmatrixFHT[i][j]);
@@ -105,6 +143,7 @@ uint16_t decodeSmallBlock(int8_t *in, uint8_t len){
 				}
 		out = properOrderedBasisExtended[maxRow] | properOrderedBasis[maxCol] | ( (DmatrixFHT[maxRow][maxCol] > 0) ? (uint16_t)0 : (uint16_t)1 );
+#endif
 #ifdef DEBUG_DECODESMALLBLOCK
 		for (int k = 0; k < NR_SMALL_BLOCK_CODED_BITS; ++k)