Commit b85b5fe4 authored by Sy

Test for avx512

parent 3e27bed4
...@@ -80,10 +80,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -80,10 +80,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[0]*Z + 63)>>5; M = (lut_numCnInCnGroups[0]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>31;
// Set pointers to start of group 3 // Set pointers to start of group 3
...@@ -104,7 +104,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -104,7 +104,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i+=2) {\n",M); fprintf(fd," for (int i=0;i<%d;i+=2) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -112,7 +112,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -112,7 +112,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// 32 CNs of second BN // 32 CNs of second BN
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][1]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -125,11 +125,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -125,11 +125,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>31)+(j*bitOffsetInGroup));
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]+1); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][0]+1);
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -137,7 +137,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -137,7 +137,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// 32 CNs of second BN // 32 CNs of second BN
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]+1); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][1]+1);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -150,7 +150,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -150,7 +150,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup)+1); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>31)+(j*bitOffsetInGroup)+1);
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
...@@ -166,10 +166,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -166,10 +166,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[1]*Z + 63)>>5; M = (lut_numCnInCnGroups[1]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>31;
// Set pointers to start of group 4 // Set pointers to start of group 4
...@@ -188,7 +188,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -188,7 +188,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>31)+lut_idxCnProcG4[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -198,7 +198,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -198,7 +198,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<3; k++) for (k=1; k<3; k++)
{ {
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>31)+lut_idxCnProcG4[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -212,7 +212,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -212,7 +212,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[1]>>5)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[1]>>31)+(j*bitOffsetInGroup));
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -230,10 +230,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -230,10 +230,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[2]*Z + 63)>>5; M = (lut_numCnInCnGroups[2]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>31;
// Set pointers to start of group 4 // Set pointers to start of group 4
...@@ -253,7 +253,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -253,7 +253,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>31)+lut_idxCnProcG5[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -263,7 +263,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -263,7 +263,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<4; k++) for (k=1; k<4; k++)
{ {
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>31)+lut_idxCnProcG5[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -277,7 +277,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -277,7 +277,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[2]>>5)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[2]>>31)+(j*bitOffsetInGroup));
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -295,10 +295,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -295,10 +295,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[3]*Z + 63)>>5; M = (lut_numCnInCnGroups[3]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>31;
// Set pointers to start of group 4 // Set pointers to start of group 4
...@@ -318,7 +318,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -318,7 +318,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>31)+lut_idxCnProcG6[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -328,7 +328,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -328,7 +328,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<5; k++) for (k=1; k<5; k++)
{ {
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>31)+lut_idxCnProcG6[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -342,7 +342,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -342,7 +342,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[3]>>5)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[3]>>31)+(j*bitOffsetInGroup));
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -362,10 +362,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -362,10 +362,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[4]*Z + 63)>>5; M = (lut_numCnInCnGroups[4]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>31;
// Set pointers to start of group 4 // Set pointers to start of group 4
...@@ -385,7 +385,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -385,7 +385,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG7[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>31)+lut_idxCnProcG7[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -395,7 +395,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -395,7 +395,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<6; k++) for (k=1; k<6; k++)
{ {
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG7[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>31)+lut_idxCnProcG7[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -409,7 +409,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -409,7 +409,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[4]>>5)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[4]>>31)+(j*bitOffsetInGroup));
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -430,10 +430,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -430,10 +430,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[5]*Z + 63)>>5; M = (lut_numCnInCnGroups[5]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>31;
// Set pointers to start of group 4 // Set pointers to start of group 4
...@@ -453,7 +453,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -453,7 +453,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG8[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>31)+lut_idxCnProcG8[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -463,7 +463,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -463,7 +463,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<7; k++) for (k=1; k<7; k++)
{ {
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG8[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>31)+lut_idxCnProcG8[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -477,7 +477,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -477,7 +477,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[5]>>5)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[5]>>31)+(j*bitOffsetInGroup));
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -499,10 +499,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -499,10 +499,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[6]*Z + 63)>>5; M = (lut_numCnInCnGroups[6]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>31;
// Set pointers to start of group 9 // Set pointers to start of group 9
...@@ -522,7 +522,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -522,7 +522,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>5)+lut_idxCnProcG9[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>31)+lut_idxCnProcG9[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -532,7 +532,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -532,7 +532,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<8; k++) for (k=1; k<8; k++)
{ {
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>5)+lut_idxCnProcG9[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>31)+lut_idxCnProcG9[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -546,7 +546,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -546,7 +546,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[6]>>5)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[6]>>31)+(j*bitOffsetInGroup));
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -569,10 +569,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -569,10 +569,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[7]*Z + 63)>>5; M = (lut_numCnInCnGroups[7]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>31;
// Set pointers to start of group 10 // Set pointers to start of group 10
...@@ -592,7 +592,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -592,7 +592,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>5)+lut_idxCnProcG10[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>31)+lut_idxCnProcG10[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -602,7 +602,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -602,7 +602,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<9; k++) for (k=1; k<9; k++)
{ {
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>5)+lut_idxCnProcG10[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>31)+lut_idxCnProcG10[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -616,7 +616,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -616,7 +616,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[7]>>5)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[7]>>31)+(j*bitOffsetInGroup));
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -642,10 +642,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -642,10 +642,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[8]*Z + 63)>>5; M = (lut_numCnInCnGroups[8]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>31;
// Set pointers to start of group 19 // Set pointers to start of group 19
...@@ -665,7 +665,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -665,7 +665,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>5)+lut_idxCnProcG19[j][0]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>31)+lut_idxCnProcG19[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0); // sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n"); fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
...@@ -675,7 +675,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -675,7 +675,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs // Loop over BNs
for (k=1; k<18; k++) for (k=1; k<18; k++)
{ {
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>5)+lut_idxCnProcG19[j][k]); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>31)+lut_idxCnProcG19[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); // min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n"); fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
...@@ -689,7 +689,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R) ...@@ -689,7 +689,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n"); fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn); // *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++; // p_cnProcBufResBit++;
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[8]>>5)+(j*bitOffsetInGroup)); fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[8]>>31)+(j*bitOffsetInGroup));
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment