Commit 5e5bc1a9 authored by Sy

Effective management of all values of Z (lifting size) and all R (coding rate) for BG1 & BG2.

Some improvement in decoding time (bnProcPc is reduced by 2x).
parent c9127621
...@@ -61,9 +61,9 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu ...@@ -61,9 +61,9 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// Offset to each bit within a group in terms of 32 Byte // Offset to each bit within a group in terms of 32 Byte
uint32_t bitOffsetInGroup; uint32_t bitOffsetInGroup;
__m512i zmm0, min, sgn, zeros,maxLLR; __m512i zmm0, min, sgn, zeros;
zeros = _mm512_setzero_si512(); zeros = _mm512_setzero_si512();
maxLLR = _mm512_set1_epi8((char)127); // maxLLR = _mm512_set1_epi8((char)127);
__m512i* p_cnProcBufResBit; __m512i* p_cnProcBufResBit;
const __m512i* p_ones = (__m512i*) ones512_epi8; const __m512i* p_ones = (__m512i*) ones512_epi8;
...@@ -95,8 +95,8 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu ...@@ -95,8 +95,8 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// Set of results pointer to correct BN address // Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup); p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
__m512i *pj0 = &p_cnProcBuf[lut_idxCnProcG3[j][0]/2]; __m512i *pj0 = &p_cnProcBuf[(lut_idxCnProcG3[j][0]/2)];
__m512i *pj1 = &p_cnProcBuf[lut_idxCnProcG3[j][1]/2]; __m512i *pj1 = &p_cnProcBuf[(lut_idxCnProcG3[j][1]/2)];
// Loop over CNs // Loop over CNs
for (i=0; i<M; i++) for (i=0; i<M; i++)
...@@ -108,7 +108,7 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu ...@@ -108,7 +108,7 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
min = _mm512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// 32 CNs of second BN // 32 CNs of second BN
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i]; // zmm0 = p_cnProcBuf[(lut_idxCnProcG3[j][1]/2) + i];
zmm0 = pj1[i]; zmm0 = pj1[i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0); sgn = _mm512_xor_si512(sgn, zmm0);
...@@ -150,14 +150,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu ...@@ -150,14 +150,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++) for (i=0; i<M; i++)
{ {
// Abs and sign of 32 CNs (first BN) // Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG4[j][0]/2 + i]; zmm0 = p_cnProcBuf[(lut_idxCnProcG4[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0); sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<3; k++) for (k=1; k<3; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG4[j][k]/2 + i]; zmm0 = p_cnProcBuf[(lut_idxCnProcG4[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0); sgn = _mm512_xor_si512(sgn, zmm0);
} }
...@@ -199,14 +199,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu ...@@ -199,14 +199,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++) for (i=0; i<M; i++)
{ {
// Abs and sign of 32 CNs (first BN) // Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG5[j][0]/2 + i]; zmm0 = p_cnProcBuf[(lut_idxCnProcG5[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0); sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<4; k++) for (k=1; k<4; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG5[j][k]/2 + i]; zmm0 = p_cnProcBuf[(lut_idxCnProcG5[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0); sgn = _mm512_xor_si512(sgn, zmm0);
} }
...@@ -249,14 +249,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu ...@@ -249,14 +249,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++) for (i=0; i<M; i++)
{ {
// Abs and sign of 32 CNs (first BN) // Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG6[j][0]/2 + i]; zmm0 = p_cnProcBuf[(lut_idxCnProcG6[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0); sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<5; k++) for (k=1; k<5; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG6[j][k]/2 + i]; zmm0 = p_cnProcBuf[(lut_idxCnProcG6[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0); sgn = _mm512_xor_si512(sgn, zmm0);
} }
...@@ -300,14 +300,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu ...@@ -300,14 +300,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++) for (i=0; i<M; i++)
{ {
// Abs and sign of 32 CNs (first BN) // Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG8[j][0]/2 + i]; zmm0 = p_cnProcBuf[(lut_idxCnProcG8[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0); sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<7; k++) for (k=1; k<7; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG8[j][k]/2 + i]; zmm0 = p_cnProcBuf[(lut_idxCnProcG8[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0); sgn = _mm512_xor_si512(sgn, zmm0);
} }
...@@ -352,14 +352,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu ...@@ -352,14 +352,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++) for (i=0; i<M; i++)
{ {
// Abs and sign of 32 CNs (first BN) // Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG10[j][0]/2 + i]; zmm0 = p_cnProcBuf[(lut_idxCnProcG10[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0); sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0); min = _mm512_abs_epi8(zmm0);
// Loop over BNs // Loop over BNs
for (k=1; k<9; k++) for (k=1; k<9; k++)
{ {
zmm0 = p_cnProcBuf[lut_idxCnProcG10[j][k]/2 + i]; zmm0 = p_cnProcBuf[(lut_idxCnProcG10[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0)); min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0); sgn = _mm512_xor_si512(sgn, zmm0);
} }
...@@ -736,10 +736,10 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu ...@@ -736,10 +736,10 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// Offset to each bit within a group in terms of 32 Byte // Offset to each bit within a group in terms of 32 Byte
uint32_t bitOffsetInGroup; uint32_t bitOffsetInGroup;
__m512i zmm0, min, sgn, zeros,maxLLR; __m512i zmm0, min, sgn, zeros;
zeros = _mm512_setzero_si512(); zeros = _mm512_setzero_si512();
maxLLR = _mm512_set1_epi8((char)127); // maxLLR = _mm512_set1_epi8((char)127);
__m512i* p_cnProcBufResBit; __m512i* p_cnProcBufResBit;
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -1026,7 +1026,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1026,7 +1026,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 3 BNs // CN group with 3 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX; bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX;
#pragma omp simd // #pragma omp simd
for (j=0;j<2; j++) for (j=0;j<2; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[0] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[0] + j*bitOffsetInGroup];
...@@ -1041,7 +1041,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1041,7 +1041,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<3; j++) for (j=0; j<3; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[1] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[1] + j*bitOffsetInGroup];
#pragma omp simd // #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[1]; i++) for (i=0; i<lut_numCnInCnGroups[1]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG4[j][i] + lut_bnPosBnProcBuf_CNG4[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG4[j][i] + lut_bnPosBnProcBuf_CNG4[j][i]*Z;
...@@ -1057,7 +1057,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1057,7 +1057,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<4; j++) for (j=0; j<4; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[2] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[2] + j*bitOffsetInGroup];
#pragma omp simd // #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[2]; i++) for (i=0; i<lut_numCnInCnGroups[2]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG5[j][i] + lut_bnPosBnProcBuf_CNG5[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG5[j][i] + lut_bnPosBnProcBuf_CNG5[j][i]*Z;
...@@ -1074,7 +1074,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1074,7 +1074,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<5; j++) for (j=0; j<5; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[3] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[3] + j*bitOffsetInGroup];
#pragma omp simd // #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[3]; i++) for (i=0; i<lut_numCnInCnGroups[3]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG6[j][i] + lut_bnPosBnProcBuf_CNG6[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG6[j][i] + lut_bnPosBnProcBuf_CNG6[j][i]*Z;
...@@ -1092,7 +1092,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1092,7 +1092,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[4] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[4] + j*bitOffsetInGroup];
#pragma omp simd // #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[4]; i++) for (i=0; i<lut_numCnInCnGroups[4]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG7[j][i] + lut_bnPosBnProcBuf_CNG7[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG7[j][i] + lut_bnPosBnProcBuf_CNG7[j][i]*Z;
...@@ -1105,7 +1105,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1105,7 +1105,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 8 BNs // CN group with 8 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX; bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX;
#pragma omp simd // #pragma omp simd
for (j=0; j<7; j++) for (j=0; j<7; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[5] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[5] + j*bitOffsetInGroup];
...@@ -1125,7 +1125,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1125,7 +1125,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<8; j++) for (j=0; j<8; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[6] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[6] + j*bitOffsetInGroup];
#pragma omp simd // #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[6]; i++) for (i=0; i<lut_numCnInCnGroups[6]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG9[j][i] + lut_bnPosBnProcBuf_CNG9[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG9[j][i] + lut_bnPosBnProcBuf_CNG9[j][i]*Z;
...@@ -1142,7 +1142,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1142,7 +1142,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<9; j++) for (j=0; j<9; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[7] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[7] + j*bitOffsetInGroup];
#pragma omp simd // #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[7]; i++) for (i=0; i<lut_numCnInCnGroups[7]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG10[j][i] + lut_bnPosBnProcBuf_CNG10[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG10[j][i] + lut_bnPosBnProcBuf_CNG10[j][i]*Z;
...@@ -1159,7 +1159,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1159,7 +1159,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<19; j++) for (j=0; j<19; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[8] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[8] + j*bitOffsetInGroup];
#pragma omp simd // #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[8]; i++) for (i=0; i<lut_numCnInCnGroups[8]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG19[j][i] + lut_bnPosBnProcBuf_CNG19[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG19[j][i] + lut_bnPosBnProcBuf_CNG19[j][i]*Z;
......
#include <stdio.h>
#include<stdint.h>
#define NB_Z 51
void nrLDPC_bnProcPc_BG1_generator_AVX2(uint16_t,int);
void nrLDPC_bnProcPc_BG2_generator_AVX2(uint16_t,int);
void nrLDPC_bnProc_BG1_generator_AVX2(uint16_t,int);
void nrLDPC_bnProc_BG2_generator_AVX2(uint16_t,int);
#include <stdio.h>
#include <stdint.h>
#define NB_R 3
void nrLDPC_bnProc_BG1_generator_AVX2(int);
void nrLDPC_bnProc_BG2_generator_AVX2(int);
void nrLDPC_bnProcPc_BG1_generator_AVX2(int);
void nrLDPC_bnProcPc_BG2_generator_AVX2(int);
int main() int main()
{ {
uint16_t Z[NB_Z]={2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,22,24,26,28,30,32,36,40,44,48,52,56,60,64,72,80,88,96,104,112,120,128,144,160,176,192,208,224,240,256,288,320,352,384}; int R[NB_R]={0,1,2};
for(int i=0; i<NB_R;i++){
for(int i=0; i<NB_Z;i++){
//bnProcPc nrLDPC_bnProc_BG1_generator_AVX2(R[i]);
nrLDPC_bnProcPc_BG1_generator_AVX2(Z[i], 0); nrLDPC_bnProc_BG2_generator_AVX2(R[i]);
nrLDPC_bnProcPc_BG2_generator_AVX2(Z[i],0);
//bnProc nrLDPC_bnProcPc_BG1_generator_AVX2(R[i]);
nrLDPC_bnProc_BG1_generator_AVX2(Z[i],0); nrLDPC_bnProcPc_BG2_generator_AVX2(R[i]);
nrLDPC_bnProc_BG2_generator_AVX2(Z[i],0);
}
}
return(0); return(0);
} }
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#include "../../nrLDPC_bnProc.h" #include "../../nrLDPC_bnProc.h"
void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) void nrLDPC_cnProc_BG2_generator_AVX2(int R)
{ {
const char *ratestr[3]={"15","13","23"}; const char *ratestr[3]={"15","13","23"};
...@@ -15,13 +15,13 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -15,13 +15,13 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
// system("mkdir -p ldpc_gen_files/avx2"); // system("mkdir -p ldpc_gen_files/avx2");
char fname[50]; char fname[50];
sprintf(fname,"../ldpc_gen_files/cnProc/nrLDPC_cnProc_BG2_Z%d_R%s_AVX2.c",Z,ratestr[R]); sprintf(fname,"../ldpc_gen_files/cnProc/nrLDPC_cnProc_BG2_R%s_AVX2.h",ratestr[R]);
FILE *fd=fopen(fname,"w"); FILE *fd=fopen(fname,"w");
if (fd == NULL) {printf("Cannot create \n");abort();} if (fd == NULL) {printf("Cannot create \n");abort();}
fprintf(fd,"#include <stdint.h>\n"); fprintf(fd,"#include <stdint.h>\n");
fprintf(fd,"#include <immintrin.h>\n"); fprintf(fd,"#include <immintrin.h>\n");
fprintf(fd,"void nrLDPC_cnProc_BG2_Z%d_R%s_AVX2(int8_t* cnProcBuf,int8_t* cnProcBufRes) {\n",Z,ratestr[R]); fprintf(fd,"static inline void nrLDPC_cnProc_BG2_R%s_AVX2(int8_t* cnProcBuf, int8_t* cnProcBufRes, uint16_t Z) {\n",ratestr[R]);
const uint8_t* lut_numCnInCnGroups; const uint8_t* lut_numCnInCnGroups;
const uint32_t* lut_startAddrCnGroups = lut_startAddrCnGroups_BG2; const uint32_t* lut_startAddrCnGroups = lut_startAddrCnGroups_BG2;
...@@ -33,7 +33,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -33,7 +33,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
// Number of CNs in Groups // Number of CNs in Groups
uint32_t M; //uint32_t M;
uint32_t j; uint32_t j;
uint32_t k; uint32_t k;
// Offset to each bit within a group in terms of 32 byte // Offset to each bit within a group in terms of 32 byte
...@@ -54,12 +54,14 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -54,12 +54,14 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
fprintf(fd," __m256i ymm0, min, sgn,ones,maxLLR;\n"); fprintf(fd," __m256i ymm0, min, sgn,ones,maxLLR;\n");
fprintf(fd," ones = _mm256_set1_epi8((char)1);\n"); fprintf(fd," ones = _mm256_set1_epi8((char)1);\n");
fprintf(fd," maxLLR = _mm256_set1_epi8((char)127);\n"); fprintf(fd," maxLLR = _mm256_set1_epi8((char)127);\n");
fprintf(fd," uint32_t M;\n");
if (lut_numCnInCnGroups[0] > 0) if (lut_numCnInCnGroups[0] > 0)
{ {
// Number of groups of 32 CNs for parallel processing // Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32 // Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[0]*Z + 31)>>5; fprintf(fd," M = (%d*Z + 31)>>5;\n",lut_numCnInCnGroups[0] );
// Set the offset to each bit within a group in terms of 32 byte // Set the offset to each bit within a group in terms of 32 byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[0]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[0]*NR_LDPC_ZMAX)>>5;
...@@ -69,7 +71,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -69,7 +71,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
for (j=0; j<3; j++) for (j=0; j<3; j++)
{ {
fprintf(fd," for (int i=0;i<%d;i+=2) {\n",M); fprintf(fd," for (int i=0;i<M;i+=2) {\n");
// Abs and sign of 32 CNs (first BN) // Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]); fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]);
...@@ -103,23 +105,6 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -103,23 +105,6 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
// min = _mm256_abs_epi8(ymm0); // min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n"); fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// 32 CNs of second BN
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]+1);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup)+1);
fprintf(fd," }\n"); fprintf(fd," }\n");
} }
} }
...@@ -135,7 +120,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -135,7 +120,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
{ {
// Number of groups of 32 CNs for parallel processing // Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32 // Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[1]*Z + 31)>>5; fprintf(fd," M = (%d*Z + 31)>>5;\n",lut_numCnInCnGroups[1] );
// Set the offset to each bit within a group in terms of 32 byte // Set the offset to each bit within a group in terms of 32 byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[1]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[1]*NR_LDPC_ZMAX)>>5;
...@@ -147,7 +132,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -147,7 +132,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
// Loop over CNs // Loop over CNs
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 32 CNs (first BN) // Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][0]); fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][0]);
...@@ -194,8 +179,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -194,8 +179,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
{ {
// Number of groups of 32 CNs for parallel processing // Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32 // Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[2]*Z + 31)>>5; fprintf(fd," M = (%d*Z + 31)>>5;\n",lut_numCnInCnGroups[2] );
// Set the offset to each bit within a group in terms of 32 byte // Set the offset to each bit within a group in terms of 32 byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[2]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[2]*NR_LDPC_ZMAX)>>5;
...@@ -204,7 +188,8 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -204,7 +188,8 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
for (j=0; j<5; j++) for (j=0; j<5; j++)
{ {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 32 CNs (first BN) // Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][0]); fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][0]);
...@@ -248,7 +233,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -248,7 +233,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
{ {
// Number of groups of 32 CNs for parallel processing // Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32 // Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[3]*Z + 31)>>5; fprintf(fd, "M = (%d*Z + 31)>>5;\n",lut_numCnInCnGroups[3] );
// Set the offset to each bit within a group in terms of 32 byte // Set the offset to each bit within a group in terms of 32 byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[3]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[3]*NR_LDPC_ZMAX)>>5;
...@@ -261,7 +246,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -261,7 +246,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
// Loop over CNs // Loop over CNs
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 32 CNs (first BN) // Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][0]); fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][0]);
...@@ -313,7 +298,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -313,7 +298,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
{ {
// Number of groups of 32 CNs for parallel processing // Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32 // Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[4]*Z + 31)>>5; fprintf(fd, "M = (%d*Z + 31)>>5;\n",lut_numCnInCnGroups[4] );
// Set the offset to each bit within a group in terms of 32 byte // Set the offset to each bit within a group in terms of 32 byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[4]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[4]*NR_LDPC_ZMAX)>>5;
...@@ -324,7 +309,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -324,7 +309,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
{ {
// Loop over CNs // Loop over CNs
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 32 CNs (first BN) // Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG8[j][0]); fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG8[j][0]);
...@@ -375,7 +360,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -375,7 +360,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
{ {
// Number of groups of 32 CNs for parallel processing // Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32 // Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[5]*Z + 31)>>5; fprintf(fd, "M = (%d*Z + 31)>>5;\n",lut_numCnInCnGroups[5] );
// Set the offset to each bit within a group in terms of 32 byte // Set the offset to each bit within a group in terms of 32 byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[5]*NR_LDPC_ZMAX)>>5; bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[5]*NR_LDPC_ZMAX)>>5;
...@@ -387,7 +372,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -387,7 +372,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
// Loop over CNs // Loop over CNs
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 32 CNs (first BN) // Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG10[j][0]); fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG10[j][0]);
...@@ -423,3 +408,4 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R) ...@@ -423,3 +408,4 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
fprintf(fd,"}\n"); fprintf(fd,"}\n");
fclose(fd); fclose(fd);
}//end of the function nrLDPC_cnProc_BG2 }//end of the function nrLDPC_cnProc_BG2
#include <stdio.h> #include <stdio.h>
#include <stdint.h> #include <stdint.h>
#define NB_Z 51 #define NB_R 3
void nrLDPC_cnProc_BG1_generator_AVX2(uint16_t,int); void nrLDPC_cnProc_BG1_generator_AVX2(int);
void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t,int); void nrLDPC_cnProc_BG2_generator_AVX2(int);
int main() int main()
{ {
uint16_t Z[NB_Z]={2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,22,24,26,28,30,32,36,40,44,48,52,56,60,64,72,80,88,96,104,112,120,128,144,160,176,192,208,224,240,256,288,320,352,384}; int R[NB_R]={0,1,2};
for(int i=0; i<NB_R;i++){
for(int i=0; i<NB_Z;i++){ nrLDPC_cnProc_BG1_generator_AVX2(R[i]);
nrLDPC_cnProc_BG2_generator_AVX2(R[i]);
nrLDPC_cnProc_BG1_generator_AVX2(Z[i],0); }
nrLDPC_cnProc_BG2_generator_AVX2(Z[i],0);
}
return(0); return(0);
} }
...@@ -8,7 +8,7 @@ OBJ= $(SRC:.c=.o) ...@@ -8,7 +8,7 @@ OBJ= $(SRC:.c=.o)
all: $(EXEC) all: $(EXEC)
cnProc_gen_avx512: $(OBJ) cnProc_gen_avx512: $(OBJ)
@$(CC) -o $@ $^ $(LDFLAGS) -O2 @$(CC) -o $@ $^ $(LDFLAGS) -O3
#main.o: cnProc_gen_avx512.h #main.o: cnProc_gen_avx512.h
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include <stdint.h> #include <stdint.h>
#include "../../nrLDPCdecoder_defs.h" #include "../../nrLDPCdecoder_defs.h"
void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) void nrLDPC_cnProc_BG2_generator_AVX512(int R)
{ {
const char *ratestr[3]={"15","13","23"}; const char *ratestr[3]={"15","13","23"};
...@@ -13,19 +13,19 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -13,19 +13,19 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
// system("mkdir -p ../ldpc_gen_files"); // system("mkdir -p ../ldpc_gen_files");
char fname[50]; char fname[50];
sprintf(fname,"../ldpc_gen_files/nrLDPC_cnProc_BG2_Z%d_R%s_AVX512.c",Z,ratestr[R]); sprintf(fname,"../ldpc_gen_files/cnProc_avx512/nrLDPC_cnProc_BG2_R%s_AVX512.h",ratestr[R]);
FILE *fd=fopen(fname,"w"); FILE *fd=fopen(fname,"w");
if (fd == NULL) {printf("Cannot create \n");abort();} if (fd == NULL) {printf("Cannot create \n");abort();}
fprintf(fd,"#include <stdint.h>\n"); //fprintf(fd,"#include <stdint.h>\n");
fprintf(fd,"#include <immintrin.h>\n"); // fprintf(fd,"#include <immintrin.h>\n");
fprintf(fd, "#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a)\n"); fprintf(fd, "#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a)\n");
fprintf(fd,"void nrLDPC_cnProc_BG2_Z%d_R%s_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) {\n",Z,ratestr[R]);
fprintf(fd,"static inline void nrLDPC_cnProc_BG2_R%s_AVX512(int8_t* cnProcBuf, int8_t* cnProcBufRes, uint16_t Z) {\n",ratestr[R]);
const uint8_t* lut_numCnInCnGroups; const uint8_t* lut_numCnInCnGroups;
const uint32_t* lut_startAddrCnGroups = lut_startAddrCnGroups_BG2; const uint32_t* lut_startAddrCnGroups = lut_startAddrCnGroups_BG2;
...@@ -36,13 +36,16 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -36,13 +36,16 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
// Number of CNs in Groups // Number of CNs in Groups
uint32_t M; //uint32_t M;
uint32_t j; uint32_t j;
uint32_t k; uint32_t k;
// Offset to each bit within a group in terms of 64 Byte // Offset to each bit within a group in terms of 64 Byte
uint32_t bitOffsetInGroup; uint32_t bitOffsetInGroup;
fprintf(fd," uint32_t M;\n");
fprintf(fd," __m512i zmm0, min, sgn,zeros,maxLLR;\n");
fprintf(fd," zeros = _mm512_setzero_si512();\n");
fprintf(fd," maxLLR = _mm512_set1_epi8((char)127);\n");
// ===================================================================== // =====================================================================
// Process group with 3 BNs // Process group with 3 BNs
fprintf(fd,"//Process group with 3 BNs\n"); fprintf(fd,"//Process group with 3 BNs\n");
...@@ -52,15 +55,11 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -52,15 +55,11 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
const uint8_t lut_idxCnProcG3[3][2] = {{72,144}, {0,144}, {0,72}}; const uint8_t lut_idxCnProcG3[3][2] = {{72,144}, {0,144}, {0,72}};
fprintf(fd," __m512i zmm0, min, sgn,zeros,maxLLR;\n");
fprintf(fd," zeros = _mm512_setzero_si512();\n");
fprintf(fd," maxLLR = _mm512_set1_epi8((char)127);\n");
if (lut_numCnInCnGroups[0] > 0) if (lut_numCnInCnGroups[0] > 0)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[0]*Z + 63)>>6; fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numCnInCnGroups[1] );
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[0]*NR_LDPC_ZMAX)>>6; bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[0]*NR_LDPC_ZMAX)>>6;
...@@ -72,13 +71,13 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -72,13 +71,13 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
{ {
fprintf(fd," for (int i=0;i<%d;i+=2) {\n",M); fprintf(fd," for (int i=0;i<M;i+=2) {\n");
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>6)+lut_idxCnProcG3[j][0]/2); fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>6)+lut_idxCnProcG3[j][0]/2);
// sgn = _mm512_sign_epi8(ones, zmm0); // sgn = _mm512_sign_epi8(ones, zmm0);
// min = _mm512_abs_epi8(zmm0); // min = _mm512_abs_epi8(zmm0);
fprintf(fd," min = _mm512_abs_epi8(sgn);\n"); fprintf(fd," min = _mm512_abs_epi8(zmm0);\n");
// 32 CNs of second BN // 32 CNs of second BN
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
...@@ -134,7 +133,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -134,7 +133,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[1]*Z + 63)>>6; fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numCnInCnGroups[1] );
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[1]*NR_LDPC_ZMAX)>>6; bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[1]*NR_LDPC_ZMAX)>>6;
...@@ -142,7 +141,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -142,7 +141,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
// Loop over every BN // Loop over every BN
for (j=0; j<4; j++) for (j=0; j<4; j++)
{ {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>6)+lut_idxCnProcG4[j][0]/2); fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>6)+lut_idxCnProcG4[j][0]/2);
...@@ -187,7 +186,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -187,7 +186,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[2]*Z + 63)>>6; fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numCnInCnGroups[2] );
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[2]*NR_LDPC_ZMAX)>>6; bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[2]*NR_LDPC_ZMAX)>>6;
...@@ -197,7 +196,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -197,7 +196,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
for (j=0; j<5; j++) for (j=0; j<5; j++)
{ {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>6)+lut_idxCnProcG5[j][0]/2); fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>6)+lut_idxCnProcG5[j][0]/2);
...@@ -241,7 +240,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -241,7 +240,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[3]*Z + 63)>>6; fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numCnInCnGroups[3] );
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[3]*NR_LDPC_ZMAX)>>6; bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[3]*NR_LDPC_ZMAX)>>6;
...@@ -251,7 +250,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -251,7 +250,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
for (j=0; j<6; j++) for (j=0; j<6; j++)
{ {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>6)+lut_idxCnProcG6[j][0]/2); fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>6)+lut_idxCnProcG6[j][0]/2);
...@@ -297,7 +296,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -297,7 +296,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[4]*Z + 63)>>6; fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numCnInCnGroups[4] );
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[4]*NR_LDPC_ZMAX)>>6; bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[4]*NR_LDPC_ZMAX)>>6;
...@@ -307,7 +306,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -307,7 +306,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
for (j=0; j<8; j++) for (j=0; j<8; j++)
{ {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>6)+lut_idxCnProcG8[j][0]/2); fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>6)+lut_idxCnProcG8[j][0]/2);
...@@ -353,7 +352,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -353,7 +352,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64 // Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[5]*Z + 63)>>6; fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numCnInCnGroups[5] );
// Set the offset to each bit within a group in terms of 64 Byte // Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[5]*NR_LDPC_ZMAX)>>6; bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[5]*NR_LDPC_ZMAX)>>6;
...@@ -363,7 +362,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -363,7 +362,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
for (j=0; j<10; j++) for (j=0; j<10; j++)
{ {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M); fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 64 CNs (first BN) // Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i]; // zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>6)+lut_idxCnProcG10[j][0]/2); fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>6)+lut_idxCnProcG10[j][0]/2);
...@@ -404,3 +403,4 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R) ...@@ -404,3 +403,4 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
#include <stdio.h> #include <stdio.h>
#include <stdint.h> #include <stdint.h>
#define NB_Z 51 #define NB_R 3
void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t,int); void nrLDPC_cnProc_BG1_generator_AVX512(int);
void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t,int); void nrLDPC_cnProc_BG2_generator_AVX512(int);
int main() int main()
{ {
uint16_t Z[NB_Z]={2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,22,24,26,28,30,32,36,40,44,48,52,56,60,64,72,80,88,96,104,112,120,128,144,160,176,192,208,224,240,256,288,320,352,384}; int R[NB_R]={0,1,2};
for(int i=0; i<NB_R;i++){
nrLDPC_cnProc_BG1_generator_AVX512(R[i]);
nrLDPC_cnProc_BG2_generator_AVX512(R[i]);
for(int i=0; i<NB_Z;i++){ }
nrLDPC_cnProc_BG1_generator_AVX512(Z[i],0);
nrLDPC_cnProc_BG2_generator_AVX512(Z[i],0);
}
return(0); return(0);
} }
/*
 * Generated BN-processing kernel (the name suggests LDPC base graph 1,
 * rate 8/9 - TODO confirm against the generator).
 *
 * For every BN group listed in the table below and for every CN slice of that
 * group, computes byte-wise with signed saturation
 *
 *     bnProcBufRes[k] = llrRes[k'] - bnProcBuf[k]
 *
 * 32 bytes at a time. The same llrRes span is re-read for each CN slice of a
 * group.
 *
 * bnProcBuf     input messages, read only
 * bnProcBufRes  output messages, written at the same offsets bnProcBuf is read
 * llrRes        LLR values, read only
 * Z             lifting size; only affects how many 32-byte vectors are
 *               processed per slice
 *
 * All three buffers are accessed through __m256i pointers at byte offsets that
 * are multiples of 32, so their base addresses must be 32-byte aligned.
 */
static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
  // One row per BN group for which the generator emitted code.
  static const struct {
    uint32_t numCn;       // number of CN slices processed for the group
    uint32_t zFactor;     // vectors per slice = (zFactor*Z + 31) >> 5
    uint32_t bufByteOff;  // byte offset of the group in bnProcBuf / bnProcBufRes
    uint32_t llrByteOff;  // byte offset of the group in llrRes
    uint32_t sliceStride; // distance between consecutive CN slices, in __m256i units
  } bnGroups[] = {
    { 2,  3,   384,  384,  36 },  // group with 2 CNs
    { 3, 21,  2688, 1536, 252 },  // group with 3 CNs
    { 4,  1, 26880, 9600,  12 },  // group with 4 CNs
    { 5,  1, 28416, 9984,  12 },  // group with 5 CNs
  };
  // The generated code contains no statements for the groups with 6 to 30 CNs.
  const uint32_t groupCount = (uint32_t)(sizeof bnGroups / sizeof bnGroups[0]);

  for (uint32_t g = 0; g < groupCount; g++) {
    // Number of 32-byte vectors per slice, rounded up
    const uint32_t vecCount = (bnGroups[g].zFactor * Z + 31) >> 5;

    const __m256i* msgIn  = (const __m256i*) &bnProcBuf   [bnGroups[g].bufByteOff];
    __m256i*       msgOut = (__m256i*)       &bnProcBufRes[bnGroups[g].bufByteOff];
    const __m256i* llrIn  = (const __m256i*) &llrRes      [bnGroups[g].llrByteOff];

    for (uint32_t cn = 0; cn < bnGroups[g].numCn; cn++) {
      const uint32_t sliceBase = cn * bnGroups[g].sliceStride;

      for (uint32_t v = 0; v < vecCount; v++) {
        // Saturating subtract: LLR minus the message stored for this CN slice
        msgOut[sliceBase + v] = _mm256_subs_epi8(llrIn[v], msgIn[sliceBase + v]);
      }
    }
  }
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment