Commit 4542322b authored by Sy's avatar Sy

TEST AVX512

parents 558fccbb 6d9ceaa7
...@@ -697,7 +697,7 @@ function main() { ...@@ -697,7 +697,7 @@ function main() {
if [ "$SIMUS_PHY" = "1" ] ; then if [ "$SIMUS_PHY" = "1" ] ; then
echo_info "Compiling physical unitary tests simulators" echo_info "Compiling physical unitary tests simulators"
# TODO: fix: dlsim_tm4 pucchsim prachsim pdcchsim pbchsim mbmssim # TODO: fix: dlsim_tm4 pucchsim prachsim pdcchsim pbchsim mbmssim
simlist="dlsim ulsim ldpctest polartest smallblocktest nr_pbchsim nr_dlschsim nr_ulschsim nr_dlsim nr_ulsim nr_pucchsim nr_prachsim" simlist="ldpctest dlsim ulsim polartest smallblocktest nr_pbchsim nr_dlschsim nr_ulschsim nr_dlsim nr_ulsim nr_pucchsim nr_prachsim"
for f in $simlist ; do for f in $simlist ; do
compilations \ compilations \
phy_simulators $f \ phy_simulators $f \
......
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
#ifndef __NR_LDPC_BNPROC__H__ #ifndef __NR_LDPC_BNPROC__H__
#define __NR_LDPC_BNPROC__H__ #define __NR_LDPC_BNPROC__H__
#include <immintrin.h>
/** /**
\brief Performs first part of BN processing on the BN processing buffer and stores the results in the LLR results buffer. \brief Performs first part of BN processing on the BN processing buffer and stores the results in the LLR results buffer.
At every BN, the sum of the returned LLRs from the connected CNs and the LLR of the receiver input is computed. At every BN, the sum of the returned LLRs from the connected CNs and the LLR of the receiver input is computed.
......
...@@ -438,6 +438,7 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -438,6 +438,7 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i]; ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0)); min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0); sgn = _mm256_sign_epi8(sgn, ymm0);
// Store result // Store result
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127 min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
......
...@@ -263,8 +263,8 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP ...@@ -263,8 +263,8 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP
uint8_t BG = p_decParams->BG; uint8_t BG = p_decParams->BG;
uint8_t numMaxIter = p_decParams->numMaxIter; uint8_t numMaxIter = p_decParams->numMaxIter;
e_nrLDPC_outMode outMode = p_decParams->outMode; e_nrLDPC_outMode outMode = p_decParams->outMode;
int8_t* cnProcBuf= p_procBuf->cnProcBuf; // int8_t* cnProcBuf= p_procBuf->cnProcBuf;
int8_t* cnProcBufRes=p_procBuf->cnProcBufRes; // int8_t* cnProcBufRes=p_procBuf->cnProcBufRes;
// Minimum number of iterations is 1 // Minimum number of iterations is 1
// 0 iterations means hard-decision on input LLRs // 0 iterations means hard-decision on input LLRs
...@@ -334,7 +334,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP ...@@ -334,7 +334,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP
case 384: case 384:
{ {
// nrLDPC_cnProc_BG1_Z384_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); // nrLDPC_cnProc_BG1_Z384_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes);
nrLDPC_cnProc_BG1_Z384_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); //were test here nrLDPC_cnProc_BG1_Z384_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); //we test here
break; break;
} }
case 352: case 352:
...@@ -349,7 +349,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP ...@@ -349,7 +349,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP
nrLDPC_cnProc_BG1_Z320_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); nrLDPC_cnProc_BG1_Z320_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes);
//nrLDPC_cnProc_BG1_Z320_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); //nrLDPC_cnProc_BG1_Z320_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes);
break; break;
} }
case 288: case 288:
{ {
...@@ -1928,8 +1928,8 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP ...@@ -1928,8 +1928,8 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP
} }
case 352: case 352:
{ {
//nrLDPC_cnProc_BG1_Z352_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); nrLDPC_cnProc_BG1_Z352_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes);
nrLDPC_cnProc_BG1_Z352_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes); //nrLDPC_cnProc_BG1_Z352_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes);
break; break;
} }
......
...@@ -69,8 +69,8 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -69,8 +69,8 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Offsets are in units of bitOffsetInGroup (1*384/64)=6 // Offsets are in units of bitOffsetInGroup (1*384/64)=6
// Offsets are in units of bitOffsetInGroup (1*384/64)=6 // Offsets are in units of bitOffsetInGroup (1*384/64)=6
const uint8_t lut_idxCnProcG3[3][2] = {{6,12}, {0,12}, {0,6}}; // const uint8_t lut_idxCnProcG3[3][2] = {{6,12}, {0,12}, {0,6}};
const uint8_t lut_idxCnProcG3[3][2] = {{12,24}, {0,24}, {0,12}};
fprintf(fd," __m512i zmm0, min, sgn,ones,maxLLR;\n"); fprintf(fd," __m512i zmm0, min, sgn,ones,maxLLR;\n");
fprintf(fd," ones = _mm512_set1_epi8((char)1);\n"); fprintf(fd," ones = _mm512_set1_epi8((char)1);\n");
...@@ -80,10 +80,17 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -80,10 +80,17 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[0]*Z + 63)>>31; M = (lut_numCnInCnGroups[0]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>31; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[0]*Z + 31)>>7;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// Set pointers to start of group 3 // Set pointers to start of group 3
...@@ -104,7 +111,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -104,7 +111,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i+=2) {\n",M); fprintf(fd," for (int i=0;i<%d;i+=2) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>7)+lut_idxCnProcG3[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -112,7 +123,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -112,7 +123,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// 32 CNs of second BN // 32 CNs of second BN
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][1]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][1]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>7)+lut_idxCnProcG3[j][1]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -125,11 +140,19 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -125,11 +140,19 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>31)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>31)+(j*bitOffsetInGroup));
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][0]+1); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][0]+1);
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>7)+(j*bitOffsetInGroup));
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>7)+lut_idxCnProcG3[j][0]+1);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -137,7 +160,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -137,7 +160,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// 32 CNs of second BN // 32 CNs of second BN
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][1]+1); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][1]+1);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>7)+lut_idxCnProcG3[j][1]+1);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -150,7 +177,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -150,7 +177,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>31)+(j*bitOffsetInGroup)+1); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>31)+(j*bitOffsetInGroup)+1);
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>7)+(j*bitOffsetInGroup)+1);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
...@@ -160,16 +191,24 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -160,16 +191,24 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 4 BNs // Process group with 4 BNs
fprintf(fd,"//Process group with 4 BNs\n"); fprintf(fd,"//Process group with 4 BNs\n");
// Offset is 5*384/64 = 30 // Offset is 5*384/64 = 30
const uint8_t lut_idxCnProcG4[4][3] = {{30,60,90}, {0,60,90}, {0,30,90}, {0,30,60}}; // const uint8_t lut_idxCnProcG4[4][3] = {{30,60,90}, {0,60,90}, {0,30,90}, {0,30,60}};
const uint8_t lut_idxCnProcG4[4][3] = {{60,120,180}, {0,120,180}, {0,60,180}, {0,60,120}};
if (lut_numCnInCnGroups[1] > 0) if (lut_numCnInCnGroups[1] > 0)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[1]*Z + 63)>>31; M = (lut_numCnInCnGroups[1]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>31; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[1]*Z + 31)>>7;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// Set pointers to start of group 4 // Set pointers to start of group 4
...@@ -188,7 +227,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -188,7 +227,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>31)+lut_idxCnProcG4[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>31)+lut_idxCnProcG4[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>7)+lut_idxCnProcG4[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -198,7 +241,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -198,7 +241,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<3; k++) for (k=1; k<3; k++)
{ {
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>31)+lut_idxCnProcG4[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>31)+lut_idxCnProcG4[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>7)+lut_idxCnProcG4[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -212,7 +259,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -212,7 +259,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[1]>>31)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[1]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[1]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -222,18 +273,28 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -222,18 +273,28 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 5 BNs // Process group with 5 BNs
fprintf(fd,"//Process group with 5 BNs\n"); fprintf(fd,"//Process group with 5 BNs\n");
// Offset is 18*384/64 = 216 // Offset is 18*384/64 = 216
const uint16_t lut_idxCnProcG5[5][4] = {{108,216,324,432}, {0,216,324,432}, //const uint16_t lut_idxCnProcG5[5][4] = {{108,216,324,432}, {0,216,324,432},
{0,108,324,432}, {0,108,216,432}, {0,108,216,324}}; // {0,108,324,432}, {0,108,216,432}, {0,108,216,324}};
const uint16_t lut_idxCnProcG5[5][4] = {{216,432,648,864}, {0,432,648,864},
{0,216,648,864}, {0,216,432,864}, {0,216,432,648}};
if (lut_numCnInCnGroups[2] > 0) if (lut_numCnInCnGroups[2] > 0)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[2]*Z + 63)>>31; M = (lut_numCnInCnGroups[2]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>31; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[2]*Z + 31)>>7;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// Set pointers to start of group 4 // Set pointers to start of group 4
...@@ -253,7 +314,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -253,7 +314,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>31)+lut_idxCnProcG5[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>31)+lut_idxCnProcG5[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>7)+lut_idxCnProcG5[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -263,7 +328,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -263,7 +328,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<4; k++) for (k=1; k<4; k++)
{ {
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>31)+lut_idxCnProcG5[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>31)+lut_idxCnProcG5[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>7)+lut_idxCnProcG5[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -277,7 +346,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -277,7 +346,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[2]>>31)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[2]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[2]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -286,19 +359,28 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -286,19 +359,28 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 6 BNs // Process group with 6 BNs
fprintf(fd,"//Process group with 6 BNs\n"); fprintf(fd,"//Process group with 6 BNs\n");
// Offset is 8*384/64 = 48 // Offset is 8*384/64 = 48
const uint16_t lut_idxCnProcG6[6][5] = {{48,96,144,192,240}, {0,96,144,192,240}, /* const uint16_t lut_idxCnProcG6[6][5] = {{48,96,144,192,240}, {0,96,144,192,240},
{0,48,144,192,240}, {0,48,96,192,240}, {0,48,144,192,240}, {0,48,96,192,240},
{0,48,96,144,240}, {0,48,96,144,192}}; {0,48,96,144,240}, {0,48,96,144,192}};*/
const uint16_t lut_idxCnProcG6[6][5] = {{96,192,288,384,480}, {0,192,288,384,480},
{0,96,288,384,480}, {0,96,192,384,480},
{0,96,192,288,480}, {0,96,192,288,384}};
if (lut_numCnInCnGroups[3] > 0) if (lut_numCnInCnGroups[3] > 0)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[3]*Z + 63)>>31; M = (lut_numCnInCnGroups[3]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>31; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[3]*Z + 31)>>7;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// Set pointers to start of group 4 // Set pointers to start of group 4
...@@ -318,7 +400,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -318,7 +400,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>31)+lut_idxCnProcG6[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>31)+lut_idxCnProcG6[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>7)+lut_idxCnProcG6[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -328,7 +414,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -328,7 +414,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<5; k++) for (k=1; k<5; k++)
{ {
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>31)+lut_idxCnProcG6[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>31)+lut_idxCnProcG6[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>7)+lut_idxCnProcG6[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -342,7 +432,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -342,7 +432,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[3]>>31)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[3]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[3]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -352,20 +446,32 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -352,20 +446,32 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 7 BNs // Process group with 7 BNs
fprintf(fd,"//Process group with 7 BNs\n"); fprintf(fd,"//Process group with 7 BNs\n");
// Offset is 5*384/64 = 30 // Offset is 5*384/64 = 30
const uint16_t lut_idxCnProcG7[7][6] = {{30,60,90,120,150,180}, {0,60,90,120,150,180}, /* const uint16_t lut_idxCnProcG7[7][6] = {{30,60,90,120,150,180}, {0,60,90,120,150,180},
{0,30,90,120,150,180}, {0,30,60,120,150,180}, {0,30,90,120,150,180}, {0,30,60,120,150,180},
{0,30,60,90,150,180}, {0,30,60,90,120,180}, {0,30,60,90,150,180}, {0,30,60,90,120,180},
{0,30,60,90,120,150}}; {0,30,60,90,120,150}};*/
const uint16_t lut_idxCnProcG7[7][6] = {{60,120,180,240,300,360}, {0,120,180,240,300,360},
{0,60,180,240,300,360}, {0,60,120,240,300,360},
{0,60,120,180,300,360}, {0,60,120,180,240,360},
{0,60,120,180,240,300}};
if (lut_numCnInCnGroups[4] > 0) if (lut_numCnInCnGroups[4] > 0)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[4]*Z + 63)>>31; M = (lut_numCnInCnGroups[4]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>31; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[4]*Z + 31)>>7;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// Set pointers to start of group 4 // Set pointers to start of group 4
...@@ -385,7 +491,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -385,7 +491,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>31)+lut_idxCnProcG7[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>31)+lut_idxCnProcG7[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>7)+lut_idxCnProcG7[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -395,7 +505,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -395,7 +505,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<6; k++) for (k=1; k<6; k++)
{ {
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>31)+lut_idxCnProcG7[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>31)+lut_idxCnProcG7[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>7)+lut_idxCnProcG7[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -409,7 +523,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -409,7 +523,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[4]>>31)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[4]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[4]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -419,10 +537,15 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -419,10 +537,15 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 8 BNs // Process group with 8 BNs
fprintf(fd,"//Process group with 8 BNs\n"); fprintf(fd,"//Process group with 8 BNs\n");
// Offset is 2*384/64 = 12 // Offset is 2*384/64 = 12
const uint8_t lut_idxCnProcG8[8][7] = {{12,24,36,48,56,72,84}, {0,24,36,48,56,72,84}, /* const uint8_t lut_idxCnProcG8[8][7] = {{12,24,36,48,56,72,84}, {0,24,36,48,56,72,84},
{0,12,36,48,56,72,84}, {0,12,24,48,56,72,84}, {0,12,36,48,56,72,84}, {0,12,24,48,56,72,84},
{0,12,24,36,56,72,84}, {0,12,24,36,48,72,84}, {0,12,24,36,56,72,84}, {0,12,24,36,48,72,84},
{0,12,24,36,48,56,84}, {0,12,24,36,48,120,72}}; {0,12,24,36,48,56,84}, {0,12,24,36,48,120,72}};*/
const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,120,144,168}, {0,48,72,96,120,144,168},
{0,24,72,96,120,144,168}, {0,24,48,96,120,144,168},
{0,24,48,72,120,144,168}, {0,24,48,72,96,144,168},
{0,24,48,72,96,120,168}, {0,24,48,72,96,120,144}};
...@@ -430,10 +553,17 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -430,10 +553,17 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[5]*Z + 63)>>31; M = (lut_numCnInCnGroups[5]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>31; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[5]*Z + 31)>>7;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// Set pointers to start of group 4 // Set pointers to start of group 4
...@@ -453,7 +583,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -453,7 +583,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>31)+lut_idxCnProcG8[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>31)+lut_idxCnProcG8[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>7)+lut_idxCnProcG8[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -463,7 +597,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -463,7 +597,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<7; k++) for (k=1; k<7; k++)
{ {
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>31)+lut_idxCnProcG8[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>31)+lut_idxCnProcG8[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>7)+lut_idxCnProcG8[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -477,7 +615,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -477,7 +615,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[5]>>31)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[5]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[5]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -486,23 +628,34 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -486,23 +628,34 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 9 BNs // Process group with 9 BNs
fprintf(fd,"//Process group with 9 BNs\n"); fprintf(fd,"//Process group with 9 BNs\n");
// Offset is 2*384/64 = 12 // Offset is 2*384/64 = 12
const uint8_t lut_idxCnProcG9[9][8] = {{12,24,36,48,60,72,84,96}, {0,24,36,48,60,72,84,96}, /*const uint8_t lut_idxCnProcG9[9][8] = {{12,24,36,48,60,72,84,96}, {0,24,36,48,60,72,84,96},
{0,12,36,48,60,72,84,96}, {0,12,24,48,60,72,84,96}, {0,12,36,48,60,72,84,96}, {0,12,24,48,60,72,84,96},
{0,12,24,36,60,72,84,96}, {0,12,24,36,48,72,84,96}, {0,12,24,36,60,72,84,96}, {0,12,24,36,48,72,84,96},
{0,12,24,36,48,60,84,96}, {0,12,24,36,48,60,72,96}, {0,12,24,36,48,60,84,96}, {0,12,24,36,48,60,72,96},
{0,12,24,36,48,60,72,84}}; {0,12,24,36,48,60,72,84}};*/
const uint8_t lut_idxCnProcG9[9][8] = {{24,48,72,96,120,144,168,192}, {0,48,72,96,120,144,168,192},
{0,24,72,96,120,144,168,192}, {0,24,48,96,120,144,168,192},
{0,24,48,72,120,144,168,192}, {0,24,48,72,96,144,168,192},
{0,24,48,72,96,120,168,192}, {0,24,48,72,96,120,144,192},
{0,24,48,72,96,120,144,168}};
if (lut_numCnInCnGroups[6] > 0) if (lut_numCnInCnGroups[6] > 0)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[6]*Z + 63)>>31; M = (lut_numCnInCnGroups[6]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>31; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[6]*Z + 31)>>7;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// Set pointers to start of group 9 // Set pointers to start of group 9
...@@ -522,7 +675,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -522,7 +675,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>31)+lut_idxCnProcG9[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>31)+lut_idxCnProcG9[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>7)+lut_idxCnProcG9[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -532,7 +689,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -532,7 +689,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<8; k++) for (k=1; k<8; k++)
{ {
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>31)+lut_idxCnProcG9[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>31)+lut_idxCnProcG9[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>7)+lut_idxCnProcG9[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -546,7 +707,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -546,7 +707,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[6]>>31)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[6]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[6]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -555,11 +720,17 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -555,11 +720,17 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 10 BNs // Process group with 10 BNs
fprintf(fd,"//Process group with 10 BNs\n"); fprintf(fd,"//Process group with 10 BNs\n");
// Offset is 1*384/64 = 6 // Offset is 1*384/64 = 6
const uint8_t lut_idxCnProcG10[10][9] = {{6,12,18,24,30,36,42,48,54}, {0,12,18,24,30,36,42,48,54}, /* const uint8_t lut_idxCnProcG10[10][9] = {{6,12,18,24,30,36,42,48,54}, {0,12,18,24,30,36,42,48,54},
{0,6,18,24,30,36,42,48,54}, {0,6,12,24,30,36,42,48,54}, {0,6,18,24,30,36,42,48,54}, {0,6,12,24,30,36,42,48,54},
{0,6,12,18,30,36,42,48,54}, {0,6,12,18,24,36,42,48,54}, {0,6,12,18,30,36,42,48,54}, {0,6,12,18,24,36,42,48,54},
{0,6,12,18,24,30,42,48,54}, {0,6,12,18,24,30,36,48,54}, {0,6,12,18,24,30,42,48,54}, {0,6,12,18,24,30,36,48,54},
{0,6,12,18,24,30,36,42,54}, {0,6,12,36,24,30,36,42,48}}; {0,6,12,18,24,30,36,42,54}, {0,6,12,36,24,30,36,42,48}};*/
const uint8_t lut_idxCnProcG10[10][9] = {{12,24,36,48,60,72,84,96,108}, {0,24,36,48,60,72,84,96,108},
{0,12,36,48,60,72,84,96,108}, {0,12,24,48,60,72,84,96,108},
{0,12,24,36,60,72,84,96,108}, {0,12,24,36,48,72,84,96,108},
{0,12,24,36,48,60,84,96,108}, {0,12,24,36,48,60,72,96,108},
{0,12,24,36,48,60,72,84,108}, {0,12,24,36,48,60,72,84,96}};
...@@ -569,10 +740,17 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -569,10 +740,17 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[7]*Z + 63)>>31; M = (lut_numCnInCnGroups[7]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>31; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[7]*Z + 31)>>7;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// Set pointers to start of group 10 // Set pointers to start of group 10
...@@ -592,7 +770,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -592,7 +770,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>31)+lut_idxCnProcG10[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>31)+lut_idxCnProcG10[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>7)+lut_idxCnProcG10[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -602,7 +784,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -602,7 +784,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<9; k++) for (k=1; k<9; k++)
{ {
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>31)+lut_idxCnProcG10[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>31)+lut_idxCnProcG10[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>7)+lut_idxCnProcG10[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -616,7 +802,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -616,7 +802,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[7]>>31)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[7]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[7]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -626,7 +816,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -626,7 +816,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 19 BNs // Process group with 19 BNs
fprintf(fd,"//Process group with 19 BNs\n"); fprintf(fd,"//Process group with 19 BNs\n");
// Offset is 4*384/64 = 24 // Offset is 4*384/64 = 24
const uint16_t lut_idxCnProcG19[19][18] = {{24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, /* const uint16_t lut_idxCnProcG19[19][18] = {{24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432},
{0,24,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432},
{0,24,48,72,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,96,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,96,144,168,192,216,240,264,288,312,336,360,384,408,432},
{0,24,48,72,96,120,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,96,120,144,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,96,120,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,96,120,144,192,216,240,264,288,312,336,360,384,408,432},
...@@ -635,17 +825,35 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -635,17 +825,35 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{0,24,48,72,96,120,144,168,192,216,240,264,312,336,360,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,336,360,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,312,336,360,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,336,360,384,408,432},
{0,24,48,72,96,120,144,168,192,216,240,264,288,312,360,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,312,360,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,384,408,432},
{0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,432},
{0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408}}; {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408}};*/
const uint16_t lut_idxCnProcG19[19][18] = {{48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816}};
if (lut_numCnInCnGroups[8] > 0) if (lut_numCnInCnGroups[8] > 0)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[8]*Z + 63)>>31; M = (lut_numCnInCnGroups[8]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>31; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[8]*Z + 31)>>7;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// Set pointers to start of group 19 // Set pointers to start of group 19
...@@ -665,7 +873,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -665,7 +873,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>31)+lut_idxCnProcG19[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>31)+lut_idxCnProcG19[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>7)+lut_idxCnProcG19[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -675,7 +887,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -675,7 +887,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<18; k++) for (k=1; k<18; k++)
{ {
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>31)+lut_idxCnProcG19[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>31)+lut_idxCnProcG19[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>7)+lut_idxCnProcG19[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -689,7 +905,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -689,7 +905,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[8]>>31)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[8]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[8]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
......
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
#ifndef __AVX512BW__ #ifndef __AVX512BW__
#pragma GCC push_options #pragma GCC push_options
//#pragma GCC target("avx512bw") #pragma GCC target("avx512bw")
#define __DISABLE_AVX512BW__ #define __DISABLE_AVX512BW__
#endif /* __AVX512BW__ */ #endif /* __AVX512BW__ */
......
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
#ifndef __AVX512F__ #ifndef __AVX512F__
#pragma GCC push_options #pragma GCC push_options
//#pragma GCC target("avx512f") #pragma GCC target("avx512f")
#define __DISABLE_AVX512F__ #define __DISABLE_AVX512F__
#endif /* __AVX512F__ */ #endif /* __AVX512F__ */
......
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
* \brief Defines the functions for check node processing * \brief Defines the functions for check node processing
* Version AVX512 * Version AVX512
*/ */
#include <immintrin.h>
#ifndef __NR_LDPC_BNPROC__H__ #ifndef __NR_LDPC_BNPROC__H__
#define __NR_LDPC_BNPROC__H__ #define __NR_LDPC_BNPROC__H__
...@@ -59,7 +59,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -59,7 +59,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
uint32_t cnOffsetInGroup; uint32_t cnOffsetInGroup;
uint8_t idxBnGroup = 0; uint8_t idxBnGroup = 0;
__m512i zmm0, zmm1, zmmRes0, zmmRes1; __m512i zmm0, zmm1, zmmRes0, zmmRes1,tmp;
// ===================================================================== // =====================================================================
// Process group with 1 CN // Process group with 1 CN
...@@ -96,8 +96,8 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -96,8 +96,8 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
//-*p_llrRes = _mm512_permute4x64_epi64(zmm0, 0xD8); // revenir ? //-*p_llrRes = _mm512_permutevar_epi32 (tmp, zmm0); // revenir ?
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
} }
...@@ -148,7 +148,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -148,7 +148,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -201,7 +201,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -201,7 +201,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -254,7 +254,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -254,7 +254,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -307,7 +307,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -307,7 +307,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -360,7 +360,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -360,7 +360,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -413,7 +413,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -413,7 +413,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -466,7 +466,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -466,7 +466,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -519,7 +519,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -519,7 +519,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -572,7 +572,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -572,7 +572,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -625,7 +625,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -625,7 +625,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -678,7 +678,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -678,7 +678,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -731,7 +731,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -731,7 +731,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -784,7 +784,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -784,7 +784,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -837,7 +837,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -837,7 +837,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -890,7 +890,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -890,7 +890,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -943,7 +943,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -943,7 +943,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -996,7 +996,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -996,7 +996,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -1049,7 +1049,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -1049,7 +1049,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -1102,7 +1102,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -1102,7 +1102,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -1155,7 +1155,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -1155,7 +1155,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -1208,7 +1208,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -1208,7 +1208,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -1261,7 +1261,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -1261,7 +1261,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -1314,7 +1314,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -1314,7 +1314,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -1367,7 +1367,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -1367,7 +1367,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -1420,7 +1420,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -1420,7 +1420,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -1473,7 +1473,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -1473,7 +1473,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -1526,7 +1526,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -1526,7 +1526,7 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Pack results back to epi8 // Pack results back to epi8
zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
*p_llrRes = _mm512_permutex2var_epi16(zmm0, 0xD8,zmm1); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -1571,16 +1571,16 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -1571,16 +1571,16 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
// Add LLR from receiver input // Add LLR from receiver input
zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);
zmmRes0 = _mm512_adds_epi16(ymmRes0, ymm0); zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);
ymm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j+1]); zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm512_adds_epi16(ymmRes1, ymm1); zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);
// Pack results back to epi8 // Pack results back to epi8
ymm0 = _mm512_packs_epi16(ymmRes0, ymmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]] // zmm0 = [zmmRes1[255:128] zmmRes0[255:128] zmmRes1[127:0] zmmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]] // p_llrRes = [zmmRes1[255:128] zmmRes1[127:0] zmmRes0[255:128] zmmRes0[127:0]]
*p_llrRes = _mm512_permute4x64_epi64(ymm0, 0xD8); *p_llrRes= _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -1610,31 +1610,31 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc ...@@ -1610,31 +1610,31 @@ static inline void nrLDPC_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_proc
for (i=0,j=0; i<M; i++,j+=2) for (i=0,j=0; i<M; i++,j+=2)
{ {
// First 16 LLRs of first CN // First 16 LLRs of first CN
ymmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j]); zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf[j]);
ymmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j+1]); zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j+1]);
// Loop over CNs // Loop over CNs
for (k=1; k<30; k++) for (k=1; k<30; k++)
{ {
ymm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]); zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j]);
ymmRes0 = _mm512_adds_epi16(ymmRes0, ymm0); zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);
ymm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]); zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[k*cnOffsetInGroup + j+1]);
ymmRes1 = _mm512_adds_epi16(ymmRes1, ymm1); zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);
} }
// Add LLR from receiver input // Add LLR from receiver input
ymm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]); zmm0 = _mm512_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm512_adds_epi16(ymmRes0, ymm0); zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);
ymm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j+1]); zmm1 = _mm512_cvtepi8_epi16(p_llrProcBuf[j+1]);
ymmRes1 = _mm512_adds_epi16(ymmRes1, ymm1); zmmRes1 = _mm512_adds_epi16(zmmRes1, zmm1);
// Pack results back to epi8 // Pack results back to epi8
ymm0 = _mm512_packs_epi16(ymmRes0, ymmRes1); zmm0 = _mm512_packs_epi16(zmmRes0, zmmRes1);
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]] // zmm0 = [zmmRes1[255:128] zmmRes0[255:128] zmmRes1[127:0] zmmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]] // p_llrRes = [zmmRes1[255:128] zmmRes1[127:0] zmmRes0[255:128] zmmRes0[127:0]]
*p_llrRes = _mm512_permute4x64_epi64(ymm0, 0xD8); *p_llrRes = _mm512_permutex2var_epi16(zmm0,tmp,zmm1);
// Next result // Next result
p_llrRes++; p_llrRes++;
...@@ -2715,12 +2715,12 @@ static inline void nrLDPC_llr2bit(int8_t* out, int8_t* llrOut, uint16_t numLLR) ...@@ -2715,12 +2715,12 @@ static inline void nrLDPC_llr2bit(int8_t* out, int8_t* llrOut, uint16_t numLLR)
uint64_t M = numLLR>>5; uint64_t M = numLLR>>5;
uint64_t Mr = numLLR&63; uint64_t Mr = numLLR&63;
const __m512i* p_zeros = (__m512i*) zeros256_epi8; const __m512i* p_zeros = (__m512i*) zeros512_epi8;
const __m512i* p_ones = (__m512i*) ones256_epi8; const __m512i* p_ones = (__m512i*) ones512_epi8;
for (i=0; i<M; i++) for (i=0; i<M; i++)
{ {
*p_out++ = _mm512_and_pd(*p_ones, _mm512_cmpgt_epi8_mask(*p_zeros, *p_llrOut)); *p_out++ = _mm512_and_si512(*p_ones, _mm512_or_si512(*p_zeros, *p_llrOut));
p_llrOut++; p_llrOut++;
} }
...@@ -2738,6 +2738,7 @@ static inline void nrLDPC_llr2bit(int8_t* out, int8_t* llrOut, uint16_t numLLR) ...@@ -2738,6 +2738,7 @@ static inline void nrLDPC_llr2bit(int8_t* out, int8_t* llrOut, uint16_t numLLR)
} }
else else
{ {
p_out8[i] = 0; p_out8[i] = 0;
} }
} }
...@@ -2767,7 +2768,7 @@ static inline void nrLDPC_llr2bitPacked(int8_t* out, int8_t* llrOut, uint16_t nu ...@@ -2767,7 +2768,7 @@ static inline void nrLDPC_llr2bitPacked(int8_t* out, int8_t* llrOut, uint16_t nu
uint64_t i; uint64_t i;
uint64_t M = numLLR>>5; uint64_t M = numLLR>>5;
uint64_t Mr = numLLR&63; uint64_t Mr = numLLR&63;
const __m512i* p_shuffle = (__m512i*) constShuffle_256_epi8; const __m512i* p_shuffle = (__m512i*) constShuffle_512_epi8;
for (i=0; i<M; i++) for (i=0; i<M; i++)
{ {
......
...@@ -27,7 +27,9 @@ ...@@ -27,7 +27,9 @@
#ifndef __NR_LDPC_CNPROC__H__ #ifndef __NR_LDPC_CNPROC__H__
#define __NR_LDPC_CNPROC__H__ #define __NR_LDPC_CNPROC__H__
#include <immintrin.h> #include <immintrin.h>
#include "include/avx512fintrin.h" #include <avx512fintrin.h>
//#include "include/immintrin.h"
/** /**
\brief Performs CN processing for BG2 on the CN processing buffer and stores the results in the CN processing results buffer. \brief Performs CN processing for BG2 on the CN processing buffer and stores the results in the CN processing results buffer.
...@@ -110,12 +112,12 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -110,12 +112,12 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// 64 CNS of second BN // 64 CNS of second BN
zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
// Store result // Store result
...@@ -157,13 +159,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -157,13 +159,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG4[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG4[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<3; k++) for (k=1; k<3; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG4[j][k] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG4[j][k] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
} }
...@@ -206,13 +208,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -206,13 +208,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG5[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG5[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<4; k++) for (k=1; k<4; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG5[j][k] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG5[j][k] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
} }
...@@ -256,13 +258,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -256,13 +258,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG6[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG6[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<5; k++) for (k=1; k<5; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG6[j][k] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG6[j][k] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
} }
...@@ -307,13 +309,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -307,13 +309,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG8[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG8[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<7; k++) for (k=1; k<7; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG8[j][k] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG8[j][k] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
} }
...@@ -359,13 +361,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -359,13 +361,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG10[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG10[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<9; k++) for (k=1; k<9; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG10[j][k] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG10[j][k] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
} }
...@@ -442,12 +444,12 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -442,12 +444,12 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
{ {
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); revenir sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// 64 CNS of second BN // 64 CNS of second BN
zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
// Store result // Store result
...@@ -489,13 +491,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -489,13 +491,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG4[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG4[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<3; k++) for (k=1; k<3; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG4[j][k] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG4[j][k] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
} }
...@@ -539,13 +541,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -539,13 +541,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG5[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG5[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<4; k++) for (k=1; k<4; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG5[j][k] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG5[j][k] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
} }
...@@ -590,13 +592,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -590,13 +592,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG6[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG6[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<5; k++) for (k=1; k<5; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG6[j][k] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG6[j][k] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
} }
...@@ -642,13 +644,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -642,13 +644,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG7[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG7[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<6; k++) for (k=1; k<6; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG7[j][k] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG7[j][k] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
} }
...@@ -694,13 +696,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -694,13 +696,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG8[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG8[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<7; k++) for (k=1; k<7; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG8[j][k] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG8[j][k] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
} }
...@@ -747,13 +749,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -747,13 +749,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG9[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG9[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<8; k++) for (k=1; k<8; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG9[j][k] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG9[j][k] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
} }
...@@ -800,13 +802,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -800,13 +802,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG10[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG10[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<9; k++) for (k=1; k<9; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG10[j][k] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG10[j][k] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
} }
...@@ -857,13 +859,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr ...@@ -857,13 +859,13 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Abs and sign of 64 CNS (first BN) // Abs and sign of 64 CNS (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG19[j][0] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG19[j][0] + i];
sgn = mm512_sign_epi16(*p_ones, zmm0); sgn = mm512_sign_epi16(*p_ones, zmm0);
min = _m512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<18; k++) for (k=1; k<18; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG19[j][k] + i]; zmm0 = p_cnProcBuf[lut_idxCnProcG19[j][k] + i];
min = _mm512_min_epu8(min, _m512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = mm512_sign_epi16(sgn, zmm0); sgn = mm512_sign_epi16(sgn, zmm0);
} }
...@@ -938,7 +940,7 @@ static inline uint64_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -938,7 +940,7 @@ static inline uint64_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1072,7 +1074,7 @@ static inline uint64_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1072,7 +1074,7 @@ static inline uint64_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1226,7 +1228,7 @@ static inline uint64_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1226,7 +1228,7 @@ static inline uint64_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1341,7 +1343,7 @@ static inline uint64_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1341,7 +1343,7 @@ static inline uint64_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1408,7 +1410,7 @@ static inline uint64_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1408,7 +1410,7 @@ static inline uint64_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1422,12 +1424,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1422,12 +1424,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<10; j++) for (j=0; j<10; j++)
{ {
// BN offset is units of 1*384/64 = 6 // BN offset is units of 1*384/64 = 6
ymm0 = p_cnProcBuf [j*6 + i]; zmm0 = p_cnProcBuf [j*6 + i];
ymm1 = p_cnProcBufRes[j*6 + i]; zmm1 = p_cnProcBufRes[j*6 + i];
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1543,7 +1545,7 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1543,7 +1545,7 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
uint64_t Mrem; uint64_t Mrem;
uint64_ ; uint64_ ;
__m512i ymm0, ymm1; __m512i zmm0, zmm1;
// ===================================================================== // =====================================================================
// Process group with 3 BNs // Process group with 3 BNs
...@@ -1574,12 +1576,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1574,12 +1576,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<3; j++) for (j=0; j<3; j++)
{ {
// BN offset is units of (6*384/32) = 72 // BN offset is units of (6*384/32) = 72
ymm0 = p_cnProcBuf [j*72 + i]; zmm0 = p_cnProcBuf [j*72 + i];
ymm1 = p_cnProcBufRes[j*72 + i]; zmm1 = p_cnProcBufRes[j*72 + i];
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1593,12 +1595,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1593,12 +1595,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<3; j++) for (j=0; j<3; j++)
{ {
// BN offset is units of (6*384/32) = 72 // BN offset is units of (6*384/32) = 72
ymm0 = p_cnProcBuf [j*72 + i]; zmm0 = p_cnProcBuf [j*72 + i];
ymm1 = p_cnProcBufRes[j*72 + i]; zmm1 = p_cnProcBufRes[j*72 + i];
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1641,12 +1643,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1641,12 +1643,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<4; j++) for (j=0; j<4; j++)
{ {
// BN offset is units of 20*384/32 = 240 // BN offset is units of 20*384/32 = 240
ymm0 = p_cnProcBuf [j*240 + i]; zmm0 = p_cnProcBuf [j*240 + i];
ymm1 = p_cnProcBufRes[j*240 + i]; zmm1 = p_cnProcBufRes[j*240 + i];
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1660,12 +1662,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1660,12 +1662,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<4; j++) for (j=0; j<4; j++)
{ {
// BN offset is units of 20*384/32 = 240 // BN offset is units of 20*384/32 = 240
ymm0 = p_cnProcBuf [j*240 + i]; zmm0 = p_cnProcBuf [j*240 + i];
ymm1 = p_cnProcBufRes[j*240 + i]; zmm1 = p_cnProcBufRes[j*240 + i];
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1708,12 +1710,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1708,12 +1710,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<5; j++) for (j=0; j<5; j++)
{ {
// BN offset is units of 9*384/32 = 108 // BN offset is units of 9*384/32 = 108
ymm0 = p_cnProcBuf [j*108 + i]; zmm0 = p_cnProcBuf [j*108 + i];
ymm1 = p_cnProcBufRes[j*108 + i]; zmm1 = p_cnProcBufRes[j*108 + i];
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1727,12 +1729,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1727,12 +1729,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<5; j++) for (j=0; j<5; j++)
{ {
// BN offset is units of 9*384/32 = 108 // BN offset is units of 9*384/32 = 108
ymm0 = p_cnProcBuf [j*108 + i]; zmm0 = p_cnProcBuf [j*108 + i];
ymm1 = p_cnProcBufRes[j*108 + i]; zmm1 = p_cnProcBufRes[j*108 + i];
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1775,12 +1777,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1775,12 +1777,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<6; j++) for (j=0; j<6; j++)
{ {
// BN offset is units of 3*384/32 = 36 // BN offset is units of 3*384/32 = 36
ymm0 = p_cnProcBuf [j*36 + i]; zmm0 = p_cnProcBuf [j*36 + i];
ymm1 = p_cnProcBufRes[j*36 + i]; zmm1 = p_cnProcBufRes[j*36 + i];
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1794,12 +1796,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1794,12 +1796,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<6; j++) for (j=0; j<6; j++)
{ {
// BN offset is units of 3*384/32 = 36 // BN offset is units of 3*384/32 = 36
ymm0 = p_cnProcBuf [j*36 + i]; zmm0 = p_cnProcBuf [j*36 + i];
ymm1 = p_cnProcBufRes[j*36 + i]; zmm1 = p_cnProcBufRes[j*36 + i];
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1842,12 +1844,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1842,12 +1844,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<8; j++) for (j=0; j<8; j++)
{ {
// BN offset is units of 2*384/32 = 24 // BN offset is units of 2*384/32 = 24
ymm0 = p_cnProcBuf [j*24 + i]; zmm0 = p_cnProcBuf [j*24 + i];
ymm1 = p_cnProcBufRes[j*24 + i]; zmm1 = p_cnProcBufRes[j*24 + i];
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1861,12 +1863,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1861,12 +1863,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<8; j++) for (j=0; j<8; j++)
{ {
// BN offset is units of 2*384/32 = 24 // BN offset is units of 2*384/32 = 24
ymm0 = p_cnProcBuf [j*24 + i]; zmm0 = p_cnProcBuf [j*24 + i];
ymm1 = p_cnProcBufRes[j*24 + i]; zmm1 = p_cnProcBufRes[j*24 + i];
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1909,12 +1911,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1909,12 +1911,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<10; j++) for (j=0; j<10; j++)
{ {
// BN offset is units of 2*384/32 = 24 // BN offset is units of 2*384/32 = 24
ymm0 = p_cnProcBuf [j*24 + i]; zmm0 = p_cnProcBuf [j*24 + i];
ymm1 = p_cnProcBufRes[j*24 + i]; zmm1 = p_cnProcBufRes[j*24 + i];
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
...@@ -1928,12 +1930,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1928,12 +1930,12 @@ static inline uint64_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<10; j++) for (j=0; j<10; j++)
{ {
// BN offset is units of 2*384/32 = 24 // BN offset is units of 2*384/32 = 24
ymm0 = p_cnProcBuf [j*24 + i]; zmm0 = p_cnProcBuf [j*24 + i];
ymm1 = p_cnProcBufRes[j*24 + i]; zmm1 = p_cnProcBufRes[j*24 + i];
// Add BN and input LLR, extract the sign bit // Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor) // and add in GF(2) (xor)
pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(ymm0,ymm1)); pcRes ^= _mm512_movemask_epi8(_mm512_adds_epi8(zmm0,zmm1));
} }
// If no error pcRes should be 0 // If no error pcRes should be 0
......
...@@ -29,17 +29,15 @@ ...@@ -29,17 +29,15 @@
*/ */
#include <stdint.h> #include <stdint.h>
//#include <immintrin.h> #include <immintrin.h>
#include "nrLDPCdecoder_defs.h" #include "nrLDPCdecoder_defs.h"
#include "nrLDPC_types.h" #include "nrLDPC_types.h"
#include "nrLDPC_init.h" #include "nrLDPC_init.h"
#include "nrLDPC_mPass.h" #include "nrLDPC_mPass.h"
#include "nrLDPC_cnProc.h" #include "nrLDPC_cnProc.h"
#include "nrLDPC_bnProc.h" #include "nrLDPC_bnProc.h"
#include <avx512fintrin.h>
//#include "include/immintrin.h" //#include "include/immintrin.h"
#include "include/avx512fintrin.h"
#include "include/avx2intrin.h"
#define NR_LDPC_ENABLE_PARITY_CHECK #define NR_LDPC_ENABLE_PARITY_CHECK
#define NR_LDPC_PROFILER_DETAIL #define NR_LDPC_PROFILER_DETAIL
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment