Commit 265b5c4d authored by Sy's avatar Sy

Test for AVX512

parent 4542322b
......@@ -20,7 +20,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd,"#include <stdint.h>\n");
fprintf(fd,"#include <immintrin.h>\n");
fprintf(fd, "#include \"../include/avx512fintrin.h\"\n");
fprintf(fd, "__m512i _mm512_sign_epi16(__m512i a, __m512i b){ \n"); /* Emulate _mm512_sign_epi16() with instructions that exist in the AVX-512 instruction set */
fprintf(fd, "b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); \n" );
......@@ -69,8 +69,8 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Offsets are in units of bitOffsetInGroup (1*384/64)=6
// Offsets are in units of bitOffsetInGroup (1*384/64)=6
// const uint8_t lut_idxCnProcG3[3][2] = {{6,12}, {0,12}, {0,6}};
const uint8_t lut_idxCnProcG3[3][2] = {{12,24}, {0,24}, {0,12}};
const uint8_t lut_idxCnProcG3[3][2] = {{6,12}, {0,12}, {0,6}};
fprintf(fd," __m512i zmm0, min, sgn,ones,maxLLR;\n");
fprintf(fd," ones = _mm512_set1_epi8((char)1);\n");
......@@ -80,17 +80,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[0]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[0]*Z + 31)>>7;
M = (lut_numCnInCnGroups[0]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 3
......@@ -111,11 +104,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i+=2) {\n",M);
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>7)+lut_idxCnProcG3[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
......@@ -123,11 +112,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// 32 CNs of second BN
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][1]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>7)+lut_idxCnProcG3[j][1]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
......@@ -140,19 +125,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>31)+(j*bitOffsetInGroup));
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup));
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][0]+1);
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>7)+(j*bitOffsetInGroup));
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>7)+lut_idxCnProcG3[j][0]+1);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]+1);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
......@@ -160,11 +137,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// 32 CNs of second BN
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>31)+lut_idxCnProcG3[j][1]+1);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>7)+lut_idxCnProcG3[j][1]+1);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]+1);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
......@@ -177,11 +150,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>31)+(j*bitOffsetInGroup)+1);
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>7)+(j*bitOffsetInGroup)+1);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup)+1);
fprintf(fd," }\n");
}
......@@ -191,24 +160,16 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 4 BNs
fprintf(fd,"//Process group with 4 BNs\n");
// Offset is 5*384/64 = 30
// const uint8_t lut_idxCnProcG4[4][3] = {{30,60,90}, {0,60,90}, {0,30,90}, {0,30,60}};
const uint8_t lut_idxCnProcG4[4][3] = {{60,120,180}, {0,120,180}, {0,60,180}, {0,60,120}};
const uint8_t lut_idxCnProcG4[4][3] = {{30,60,90}, {0,60,90}, {0,30,90}, {0,30,60}};
if (lut_numCnInCnGroups[1] > 0)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[1]*Z + 63)>>31;
M = (lut_numCnInCnGroups[1]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[1]*Z + 31)>>7;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
......@@ -227,11 +188,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>31)+lut_idxCnProcG4[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>7)+lut_idxCnProcG4[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
......@@ -241,11 +198,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs
for (k=1; k<3; k++)
{
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>31)+lut_idxCnProcG4[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>7)+lut_idxCnProcG4[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
......@@ -259,11 +212,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[1]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[1]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[1]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -273,28 +222,18 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 5 BNs
fprintf(fd,"//Process group with 5 BNs\n");
// Offset is 18*384/64 = 216
//const uint16_t lut_idxCnProcG5[5][4] = {{108,216,324,432}, {0,216,324,432},
// {0,108,324,432}, {0,108,216,432}, {0,108,216,324}};
const uint16_t lut_idxCnProcG5[5][4] = {{108,216,324,432}, {0,216,324,432},
{0,108,324,432}, {0,108,216,432}, {0,108,216,324}};
const uint16_t lut_idxCnProcG5[5][4] = {{216,432,648,864}, {0,432,648,864},
{0,216,648,864}, {0,216,432,864}, {0,216,432,648}};
if (lut_numCnInCnGroups[2] > 0)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[2]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[2]*Z + 31)>>7;
M = (lut_numCnInCnGroups[2]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
......@@ -314,11 +253,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>31)+lut_idxCnProcG5[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>7)+lut_idxCnProcG5[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
......@@ -328,11 +263,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs
for (k=1; k<4; k++)
{
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>31)+lut_idxCnProcG5[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>7)+lut_idxCnProcG5[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
......@@ -346,11 +277,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[2]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[2]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[2]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -359,28 +286,19 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 6 BNs
fprintf(fd,"//Process group with 6 BNs\n");
// Offset is 8*384/64 = 48
/* const uint16_t lut_idxCnProcG6[6][5] = {{48,96,144,192,240}, {0,96,144,192,240},
const uint16_t lut_idxCnProcG6[6][5] = {{48,96,144,192,240}, {0,96,144,192,240},
{0,48,144,192,240}, {0,48,96,192,240},
{0,48,96,144,240}, {0,48,96,144,192}};*/
{0,48,96,144,240}, {0,48,96,144,192}};
const uint16_t lut_idxCnProcG6[6][5] = {{96,192,288,384,480}, {0,192,288,384,480},
{0,96,288,384,480}, {0,96,192,384,480},
{0,96,192,288,480}, {0,96,192,288,384}};
if (lut_numCnInCnGroups[3] > 0)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[3]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[3]*Z + 31)>>7;
M = (lut_numCnInCnGroups[3]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
......@@ -400,11 +318,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>31)+lut_idxCnProcG6[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>7)+lut_idxCnProcG6[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
......@@ -414,11 +328,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs
for (k=1; k<5; k++)
{
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>31)+lut_idxCnProcG6[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>7)+lut_idxCnProcG6[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
......@@ -432,11 +342,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[3]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[3]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[3]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -446,32 +352,20 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 7 BNs
fprintf(fd,"//Process group with 7 BNs\n");
// Offset is 5*384/64 = 30
/* const uint16_t lut_idxCnProcG7[7][6] = {{30,60,90,120,150,180}, {0,60,90,120,150,180},
const uint16_t lut_idxCnProcG7[7][6] = {{30,60,90,120,150,180}, {0,60,90,120,150,180},
{0,30,90,120,150,180}, {0,30,60,120,150,180},
{0,30,60,90,150,180}, {0,30,60,90,120,180},
{0,30,60,90,120,150}};*/
{0,30,60,90,120,150}};
const uint16_t lut_idxCnProcG7[7][6] = {{60,120,180,240,300,360}, {0,120,180,240,300,360},
{0,60,180,240,300,360}, {0,60,120,240,300,360},
{0,60,120,180,300,360}, {0,60,120,180,240,360},
{0,60,120,180,240,300}};
if (lut_numCnInCnGroups[4] > 0)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[4]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[4]*Z + 31)>>7;
M = (lut_numCnInCnGroups[4]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
......@@ -491,11 +385,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>31)+lut_idxCnProcG7[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>7)+lut_idxCnProcG7[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG7[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
......@@ -505,11 +395,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs
for (k=1; k<6; k++)
{
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>31)+lut_idxCnProcG7[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>7)+lut_idxCnProcG7[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG7[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
......@@ -523,11 +409,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[4]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[4]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[4]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -537,15 +419,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 8 BNs
fprintf(fd,"//Process group with 8 BNs\n");
// Offset is 2*384/64 = 12
/* const uint8_t lut_idxCnProcG8[8][7] = {{12,24,36,48,56,72,84}, {0,24,36,48,56,72,84},
const uint8_t lut_idxCnProcG8[8][7] = {{12,24,36,48,56,72,84}, {0,24,36,48,56,72,84},
{0,12,36,48,56,72,84}, {0,12,24,48,56,72,84},
{0,12,24,36,56,72,84}, {0,12,24,36,48,72,84},
{0,12,24,36,48,56,84}, {0,12,24,36,48,120,72}};*/
const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,120,144,168}, {0,48,72,96,120,144,168},
{0,24,72,96,120,144,168}, {0,24,48,96,120,144,168},
{0,24,48,72,120,144,168}, {0,24,48,72,96,144,168},
{0,24,48,72,96,120,168}, {0,24,48,72,96,120,144}};
{0,12,24,36,48,56,84}, {0,12,24,36,48,120,72}};
......@@ -553,17 +430,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[5]*Z + 63)>>31;
M = (lut_numCnInCnGroups[5]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[5]*Z + 31)>>7;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
......@@ -583,11 +453,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>31)+lut_idxCnProcG8[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>7)+lut_idxCnProcG8[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG8[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
......@@ -597,11 +463,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs
for (k=1; k<7; k++)
{
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>31)+lut_idxCnProcG8[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>7)+lut_idxCnProcG8[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG8[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
......@@ -615,11 +477,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[5]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[5]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[5]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -628,34 +486,23 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 9 BNs
fprintf(fd,"//Process group with 9 BNs\n");
// Offset is 2*384/64 = 12
/*const uint8_t lut_idxCnProcG9[9][8] = {{12,24,36,48,60,72,84,96}, {0,24,36,48,60,72,84,96},
const uint8_t lut_idxCnProcG9[9][8] = {{12,24,36,48,60,72,84,96}, {0,24,36,48,60,72,84,96},
{0,12,36,48,60,72,84,96}, {0,12,24,48,60,72,84,96},
{0,12,24,36,60,72,84,96}, {0,12,24,36,48,72,84,96},
{0,12,24,36,48,60,84,96}, {0,12,24,36,48,60,72,96},
{0,12,24,36,48,60,72,84}};*/
{0,12,24,36,48,60,72,84}};
const uint8_t lut_idxCnProcG9[9][8] = {{24,48,72,96,120,144,168,192}, {0,48,72,96,120,144,168,192},
{0,24,72,96,120,144,168,192}, {0,24,48,96,120,144,168,192},
{0,24,48,72,120,144,168,192}, {0,24,48,72,96,144,168,192},
{0,24,48,72,96,120,168,192}, {0,24,48,72,96,120,144,192},
{0,24,48,72,96,120,144,168}};
if (lut_numCnInCnGroups[6] > 0)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[6]*Z + 63)>>31;
M = (lut_numCnInCnGroups[6]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[6]*Z + 31)>>7;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 9
......@@ -675,11 +522,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>31)+lut_idxCnProcG9[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>7)+lut_idxCnProcG9[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>5)+lut_idxCnProcG9[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
......@@ -689,11 +532,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs
for (k=1; k<8; k++)
{
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>31)+lut_idxCnProcG9[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>7)+lut_idxCnProcG9[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>5)+lut_idxCnProcG9[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
......@@ -707,11 +546,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[6]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[6]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[6]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -720,17 +555,11 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 10 BNs
fprintf(fd,"//Process group with 10 BNs\n");
// Offset is 1*384/64 = 6
/* const uint8_t lut_idxCnProcG10[10][9] = {{6,12,18,24,30,36,42,48,54}, {0,12,18,24,30,36,42,48,54},
const uint8_t lut_idxCnProcG10[10][9] = {{6,12,18,24,30,36,42,48,54}, {0,12,18,24,30,36,42,48,54},
{0,6,18,24,30,36,42,48,54}, {0,6,12,24,30,36,42,48,54},
{0,6,12,18,30,36,42,48,54}, {0,6,12,18,24,36,42,48,54},
{0,6,12,18,24,30,42,48,54}, {0,6,12,18,24,30,36,48,54},
{0,6,12,18,24,30,36,42,54}, {0,6,12,36,24,30,36,42,48}};*/
const uint8_t lut_idxCnProcG10[10][9] = {{12,24,36,48,60,72,84,96,108}, {0,24,36,48,60,72,84,96,108},
{0,12,36,48,60,72,84,96,108}, {0,12,24,48,60,72,84,96,108},
{0,12,24,36,60,72,84,96,108}, {0,12,24,36,48,72,84,96,108},
{0,12,24,36,48,60,84,96,108}, {0,12,24,36,48,60,72,96,108},
{0,12,24,36,48,60,72,84,108}, {0,12,24,36,48,60,72,84,96}};
{0,6,12,18,24,30,36,42,54}, {0,6,12,36,24,30,36,42,48}};
......@@ -740,17 +569,10 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[7]*Z + 63)>>31;
M = (lut_numCnInCnGroups[7]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[7]*Z + 31)>>7;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 10
......@@ -770,11 +592,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>31)+lut_idxCnProcG10[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>7)+lut_idxCnProcG10[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>5)+lut_idxCnProcG10[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
......@@ -784,11 +602,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Loop over BNs
for (k=1; k<9; k++)
{
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>31)+lut_idxCnProcG10[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>7)+lut_idxCnProcG10[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>5)+lut_idxCnProcG10[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
......@@ -802,11 +616,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[7]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[7]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[7]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -816,7 +626,7 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
// Process group with 19 BNs
fprintf(fd,"//Process group with 19 BNs\n");
// Offset is 4*384/64 = 24
/* const uint16_t lut_idxCnProcG19[19][18] = {{24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432},
const uint16_t lut_idxCnProcG19[19][18] = {{24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432},
{0,24,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432},
{0,24,48,72,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,96,144,168,192,216,240,264,288,312,336,360,384,408,432},
{0,24,48,72,96,120,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,96,120,144,192,216,240,264,288,312,336,360,384,408,432},
......@@ -825,35 +635,17 @@ void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{0,24,48,72,96,120,144,168,192,216,240,264,312,336,360,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,336,360,384,408,432},
{0,24,48,72,96,120,144,168,192,216,240,264,288,312,360,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,384,408,432},
{0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,432},
{0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408}};*/
const uint16_t lut_idxCnProcG19[19][18] = {{48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816}};
{0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408}};
if (lut_numCnInCnGroups[8] > 0)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
<<<<<<< HEAD
M = (lut_numCnInCnGroups[8]*Z + 63)>>31;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>31;
=======
M = (lut_numCnInCnGroups[8]*Z + 31)>>7;
M = (lut_numCnInCnGroups[8]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>7;
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 19
......@@ -873,11 +665,7 @@ const uint16_t lut_idxCnProcG19[19][18] = {{48,96,144,192,240,288,336,384,432,48
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>31)+lut_idxCnProcG19[j][0]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>7)+lut_idxCnProcG19[j][0]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>5)+lut_idxCnProcG19[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
......@@ -887,11 +675,7 @@ const uint16_t lut_idxCnProcG19[19][18] = {{48,96,144,192,240,288,336,384,432,48
// Loop over BNs
for (k=1; k<18; k++)
{
<<<<<<< HEAD
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>31)+lut_idxCnProcG19[j][k]);
=======
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>7)+lut_idxCnProcG19[j][k]);
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>5)+lut_idxCnProcG19[j][k]);
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
......@@ -905,11 +689,7 @@ const uint16_t lut_idxCnProcG19[19][18] = {{48,96,144,192,240,288,336,384,432,48
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
<<<<<<< HEAD
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[8]>>31)+(j*bitOffsetInGroup));
=======
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[8]>>7)+(j*bitOffsetInGroup));
>>>>>>> 6d9ceaa712033dc4f28050886be43572df7c2b68
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[8]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment