Commit 5e5bc1a9 authored by Sy's avatar Sy

Efficient handling of all values of Z (lifting size) and R (coding rate) for BG1 & BG2

Some improvement in decoding time (bnProcPc time is reduced by 2x)
parent c9127621
......@@ -61,9 +61,9 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// Offset to each bit within a group in terms of 32 Byte
uint32_t bitOffsetInGroup;
__m512i zmm0, min, sgn, zeros,maxLLR;
__m512i zmm0, min, sgn, zeros;
zeros = _mm512_setzero_si512();
maxLLR = _mm512_set1_epi8((char)127);
// maxLLR = _mm512_set1_epi8((char)127);
__m512i* p_cnProcBufResBit;
const __m512i* p_ones = (__m512i*) ones512_epi8;
......@@ -95,8 +95,8 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
__m512i *pj0 = &p_cnProcBuf[lut_idxCnProcG3[j][0]/2];
__m512i *pj1 = &p_cnProcBuf[lut_idxCnProcG3[j][1]/2];
__m512i *pj0 = &p_cnProcBuf[(lut_idxCnProcG3[j][0]/2)];
__m512i *pj1 = &p_cnProcBuf[(lut_idxCnProcG3[j][1]/2)];
// Loop over CNs
for (i=0; i<M; i++)
......@@ -108,7 +108,7 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
min = _mm512_abs_epi8(zmm0);
// 32 CNs of second BN
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
// zmm0 = p_cnProcBuf[(lut_idxCnProcG3[j][1]/2) + i];
zmm0 = pj1[i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
......@@ -150,14 +150,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG4[j][0]/2 + i];
zmm0 = p_cnProcBuf[(lut_idxCnProcG4[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<3; k++)
{
zmm0 = p_cnProcBuf[lut_idxCnProcG4[j][k]/2 + i];
zmm0 = p_cnProcBuf[(lut_idxCnProcG4[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
......@@ -199,14 +199,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG5[j][0]/2 + i];
zmm0 = p_cnProcBuf[(lut_idxCnProcG5[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<4; k++)
{
zmm0 = p_cnProcBuf[lut_idxCnProcG5[j][k]/2 + i];
zmm0 = p_cnProcBuf[(lut_idxCnProcG5[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
......@@ -249,14 +249,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG6[j][0]/2 + i];
zmm0 = p_cnProcBuf[(lut_idxCnProcG6[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<5; k++)
{
zmm0 = p_cnProcBuf[lut_idxCnProcG6[j][k]/2 + i];
zmm0 = p_cnProcBuf[(lut_idxCnProcG6[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
......@@ -300,14 +300,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG8[j][0]/2 + i];
zmm0 = p_cnProcBuf[(lut_idxCnProcG8[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<7; k++)
{
zmm0 = p_cnProcBuf[lut_idxCnProcG8[j][k]/2 + i];
zmm0 = p_cnProcBuf[(lut_idxCnProcG8[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
......@@ -352,14 +352,14 @@ static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[lut_idxCnProcG10[j][0]/2 + i];
zmm0 = p_cnProcBuf[(lut_idxCnProcG10[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<9; k++)
{
zmm0 = p_cnProcBuf[lut_idxCnProcG10[j][k]/2 + i];
zmm0 = p_cnProcBuf[(lut_idxCnProcG10[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
......@@ -736,10 +736,10 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// Offset to each bit within a group in terms of 32 Byte
uint32_t bitOffsetInGroup;
__m512i zmm0, min, sgn, zeros,maxLLR;
__m512i zmm0, min, sgn, zeros;
zeros = _mm512_setzero_si512();
maxLLR = _mm512_set1_epi8((char)127);
// maxLLR = _mm512_set1_epi8((char)127);
__m512i* p_cnProcBufResBit;
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -1026,7 +1026,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 3 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX;
#pragma omp simd
// #pragma omp simd
for (j=0;j<2; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[0] + j*bitOffsetInGroup];
......@@ -1041,7 +1041,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<3; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[1] + j*bitOffsetInGroup];
#pragma omp simd
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[1]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG4[j][i] + lut_bnPosBnProcBuf_CNG4[j][i]*Z;
......@@ -1057,7 +1057,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<4; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[2] + j*bitOffsetInGroup];
#pragma omp simd
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[2]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG5[j][i] + lut_bnPosBnProcBuf_CNG5[j][i]*Z;
......@@ -1074,7 +1074,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<5; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[3] + j*bitOffsetInGroup];
#pragma omp simd
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[3]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG6[j][i] + lut_bnPosBnProcBuf_CNG6[j][i]*Z;
......@@ -1092,7 +1092,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[4] + j*bitOffsetInGroup];
#pragma omp simd
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[4]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG7[j][i] + lut_bnPosBnProcBuf_CNG7[j][i]*Z;
......@@ -1105,7 +1105,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 8 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX;
#pragma omp simd
// #pragma omp simd
for (j=0; j<7; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[5] + j*bitOffsetInGroup];
......@@ -1125,7 +1125,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<8; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[6] + j*bitOffsetInGroup];
#pragma omp simd
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[6]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG9[j][i] + lut_bnPosBnProcBuf_CNG9[j][i]*Z;
......@@ -1142,7 +1142,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<9; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[7] + j*bitOffsetInGroup];
#pragma omp simd
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[7]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG10[j][i] + lut_bnPosBnProcBuf_CNG10[j][i]*Z;
......@@ -1159,7 +1159,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
for (j=0; j<19; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[8] + j*bitOffsetInGroup];
#pragma omp simd
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[8]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG19[j][i] + lut_bnPosBnProcBuf_CNG19[j][i]*Z;
......
#include <stdio.h>
#include<stdint.h>
#define NB_Z 51
void nrLDPC_bnProcPc_BG1_generator_AVX2(uint16_t,int);
void nrLDPC_bnProcPc_BG2_generator_AVX2(uint16_t,int);
void nrLDPC_bnProc_BG1_generator_AVX2(uint16_t,int);
void nrLDPC_bnProc_BG2_generator_AVX2(uint16_t,int);
#include <stdio.h>
#include <stdint.h>
#define NB_R 3
void nrLDPC_bnProc_BG1_generator_AVX2(int);
void nrLDPC_bnProc_BG2_generator_AVX2(int);
void nrLDPC_bnProcPc_BG1_generator_AVX2(int);
void nrLDPC_bnProcPc_BG2_generator_AVX2(int);
int main()
{
uint16_t Z[NB_Z]={2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,22,24,26,28,30,32,36,40,44,48,52,56,60,64,72,80,88,96,104,112,120,128,144,160,176,192,208,224,240,256,288,320,352,384};
for(int i=0; i<NB_Z;i++){
int R[NB_R]={0,1,2};
for(int i=0; i<NB_R;i++){
//bnProcPc
nrLDPC_bnProcPc_BG1_generator_AVX2(Z[i], 0);
nrLDPC_bnProcPc_BG2_generator_AVX2(Z[i],0);
//bnProc
nrLDPC_bnProc_BG1_generator_AVX2(Z[i],0);
nrLDPC_bnProc_BG2_generator_AVX2(Z[i],0);
nrLDPC_bnProc_BG1_generator_AVX2(R[i]);
nrLDPC_bnProc_BG2_generator_AVX2(R[i]);
nrLDPC_bnProcPc_BG1_generator_AVX2(R[i]);
nrLDPC_bnProcPc_BG2_generator_AVX2(R[i]);
}
}
return(0);
return(0);
}
......@@ -5,7 +5,7 @@
#include "../../nrLDPC_bnProc.h"
void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
void nrLDPC_cnProc_BG2_generator_AVX2(int R)
{
const char *ratestr[3]={"15","13","23"};
......@@ -15,13 +15,13 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
// system("mkdir -p ldpc_gen_files/avx2");
// The generated path "../ldpc_gen_files/cnProc/nrLDPC_cnProc_BG2_R<rate>_AVX2.h" is
// 53 characters plus the terminator, so a 50-byte buffer was overrun by sprintf.
char fname[128];
sprintf(fname,"../ldpc_gen_files/cnProc/nrLDPC_cnProc_BG2_Z%d_R%s_AVX2.c",Z,ratestr[R]);
sprintf(fname,"../ldpc_gen_files/cnProc/nrLDPC_cnProc_BG2_R%s_AVX2.h",ratestr[R]);
FILE *fd=fopen(fname,"w");
if (fd == NULL) {printf("Cannot create \n");abort();}
fprintf(fd,"#include <stdint.h>\n");
fprintf(fd,"#include <immintrin.h>\n");
fprintf(fd,"void nrLDPC_cnProc_BG2_Z%d_R%s_AVX2(int8_t* cnProcBuf,int8_t* cnProcBufRes) {\n",Z,ratestr[R]);
fprintf(fd,"static inline void nrLDPC_cnProc_BG2_R%s_AVX2(int8_t* cnProcBuf, int8_t* cnProcBufRes, uint16_t Z) {\n",ratestr[R]);
const uint8_t* lut_numCnInCnGroups;
const uint32_t* lut_startAddrCnGroups = lut_startAddrCnGroups_BG2;
......@@ -33,7 +33,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
// Number of CNs in Groups
uint32_t M;
//uint32_t M;
uint32_t j;
uint32_t k;
// Offset to each bit within a group in terms of 32 byte
......@@ -54,12 +54,14 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
fprintf(fd," __m256i ymm0, min, sgn,ones,maxLLR;\n");
fprintf(fd," ones = _mm256_set1_epi8((char)1);\n");
fprintf(fd," maxLLR = _mm256_set1_epi8((char)127);\n");
fprintf(fd," uint32_t M;\n");
if (lut_numCnInCnGroups[0] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[0]*Z + 31)>>5;
fprintf(fd," M = (%d*Z + 31)>>5;\n",lut_numCnInCnGroups[0] );
// Set the offset to each bit within a group in terms of 32 byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[0]*NR_LDPC_ZMAX)>>5;
......@@ -69,7 +71,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
for (j=0; j<3; j++)
{
fprintf(fd," for (int i=0;i<%d;i+=2) {\n",M);
fprintf(fd," for (int i=0;i<M;i+=2) {\n");
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]);
......@@ -102,24 +104,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// 32 CNs of second BN
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]+1);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup)+1);
fprintf(fd," }\n");
}
}
......@@ -135,7 +120,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[1]*Z + 31)>>5;
fprintf(fd," M = (%d*Z + 31)>>5;\n",lut_numCnInCnGroups[1] );
// Set the offset to each bit within a group in terms of 32 byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[1]*NR_LDPC_ZMAX)>>5;
......@@ -147,7 +132,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
// Loop over CNs
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][0]);
......@@ -194,8 +179,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[2]*Z + 31)>>5;
fprintf(fd," M = (%d*Z + 31)>>5;\n",lut_numCnInCnGroups[2] );
// Set the offset to each bit within a group in terms of 32 byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[2]*NR_LDPC_ZMAX)>>5;
......@@ -204,7 +188,8 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
for (j=0; j<5; j++)
{
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][0]);
......@@ -248,7 +233,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[3]*Z + 31)>>5;
fprintf(fd, "M = (%d*Z + 31)>>5;\n",lut_numCnInCnGroups[3] );
// Set the offset to each bit within a group in terms of 32 byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[3]*NR_LDPC_ZMAX)>>5;
......@@ -261,7 +246,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
// Loop over CNs
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][0]);
......@@ -313,7 +298,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[4]*Z + 31)>>5;
fprintf(fd, "M = (%d*Z + 31)>>5;\n",lut_numCnInCnGroups[4] );
// Set the offset to each bit within a group in terms of 32 byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[4]*NR_LDPC_ZMAX)>>5;
......@@ -324,7 +309,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
{
// Loop over CNs
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG8[j][0]);
......@@ -375,7 +360,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[5]*Z + 31)>>5;
fprintf(fd, "M = (%d*Z + 31)>>5;\n",lut_numCnInCnGroups[5] );
// Set the offset to each bit within a group in terms of 32 byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[5]*NR_LDPC_ZMAX)>>5;
......@@ -387,7 +372,7 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
// Loop over CNs
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG10[j][0]);
......@@ -423,3 +408,4 @@ void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t Z,int R)
fprintf(fd,"}\n");
fclose(fd);
}//end of the function nrLDPC_cnProc_BG2
#include <stdio.h>
#include <stdint.h>
#define NB_Z 51
void nrLDPC_cnProc_BG1_generator_AVX2(uint16_t,int);
void nrLDPC_cnProc_BG2_generator_AVX2(uint16_t,int);
#define NB_R 3
void nrLDPC_cnProc_BG1_generator_AVX2(int);
void nrLDPC_cnProc_BG2_generator_AVX2(int);
int main()
{
uint16_t Z[NB_Z]={2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,22,24,26,28,30,32,36,40,44,48,52,56,60,64,72,80,88,96,104,112,120,128,144,160,176,192,208,224,240,256,288,320,352,384};
int R[NB_R]={0,1,2};
for(int i=0; i<NB_R;i++){
for(int i=0; i<NB_Z;i++){
nrLDPC_cnProc_BG1_generator_AVX2(R[i]);
nrLDPC_cnProc_BG2_generator_AVX2(R[i]);
nrLDPC_cnProc_BG1_generator_AVX2(Z[i],0);
nrLDPC_cnProc_BG2_generator_AVX2(Z[i],0);
}
}
return(0);
}
......@@ -8,7 +8,7 @@ OBJ= $(SRC:.c=.o)
all: $(EXEC)
cnProc_gen_avx512: $(OBJ)
@$(CC) -o $@ $^ $(LDFLAGS) -O2
@$(CC) -o $@ $^ $(LDFLAGS) -O3
#main.o: cnProc_gen_avx512.h
......
......@@ -3,7 +3,7 @@
#include <stdint.h>
#include "../../nrLDPCdecoder_defs.h"
void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
void nrLDPC_cnProc_BG2_generator_AVX512(int R)
{
const char *ratestr[3]={"15","13","23"};
......@@ -13,19 +13,19 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
// system("mkdir -p ../ldpc_gen_files");
// The generated path "../ldpc_gen_files/cnProc_avx512/nrLDPC_cnProc_BG2_R<rate>_AVX512.h"
// is 62 characters plus the terminator, so a 50-byte buffer was overrun by sprintf.
char fname[128];
sprintf(fname,"../ldpc_gen_files/nrLDPC_cnProc_BG2_Z%d_R%s_AVX512.c",Z,ratestr[R]);
sprintf(fname,"../ldpc_gen_files/cnProc_avx512/nrLDPC_cnProc_BG2_R%s_AVX512.h",ratestr[R]);
FILE *fd=fopen(fname,"w");
if (fd == NULL) {printf("Cannot create \n");abort();}
fprintf(fd,"#include <stdint.h>\n");
fprintf(fd,"#include <immintrin.h>\n");
//fprintf(fd,"#include <stdint.h>\n");
// fprintf(fd,"#include <immintrin.h>\n");
fprintf(fd, "#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a)\n");
fprintf(fd,"void nrLDPC_cnProc_BG2_Z%d_R%s_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) {\n",Z,ratestr[R]);
fprintf(fd,"static inline void nrLDPC_cnProc_BG2_R%s_AVX512(int8_t* cnProcBuf, int8_t* cnProcBufRes, uint16_t Z) {\n",ratestr[R]);
const uint8_t* lut_numCnInCnGroups;
const uint32_t* lut_startAddrCnGroups = lut_startAddrCnGroups_BG2;
......@@ -36,13 +36,16 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
// Number of CNs in Groups
uint32_t M;
//uint32_t M;
uint32_t j;
uint32_t k;
// Offset to each bit within a group in terms of 64 Byte
uint32_t bitOffsetInGroup;
fprintf(fd," uint32_t M;\n");
fprintf(fd," __m512i zmm0, min, sgn,zeros,maxLLR;\n");
fprintf(fd," zeros = _mm512_setzero_si512();\n");
fprintf(fd," maxLLR = _mm512_set1_epi8((char)127);\n");
// =====================================================================
// Process group with 3 BNs
fprintf(fd,"//Process group with 3 BNs\n");
......@@ -52,15 +55,11 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
const uint8_t lut_idxCnProcG3[3][2] = {{72,144}, {0,144}, {0,72}};
fprintf(fd," __m512i zmm0, min, sgn,zeros,maxLLR;\n");
fprintf(fd," zeros = _mm512_setzero_si512();\n");
fprintf(fd," maxLLR = _mm512_set1_epi8((char)127);\n");
if (lut_numCnInCnGroups[0] > 0)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[0]*Z + 63)>>6;
// The 3-BN group is entry 0 of the LUT: use the same index as the enclosing
// guard and as bitOffsetInGroup, not entry 1 (which belongs to the 4-BN group).
fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numCnInCnGroups[0] );
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[0]*NR_LDPC_ZMAX)>>6;
......@@ -72,13 +71,13 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
{
fprintf(fd," for (int i=0;i<%d;i+=2) {\n",M);
fprintf(fd," for (int i=0;i<M;i+=2) {\n");
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>6)+lut_idxCnProcG3[j][0]/2);
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>6)+lut_idxCnProcG3[j][0]/2);
// sgn = _mm512_sign_epi8(ones, zmm0);
// min = _mm512_abs_epi8(zmm0);
fprintf(fd," min = _mm512_abs_epi8(sgn);\n");
fprintf(fd," min = _mm512_abs_epi8(zmm0);\n");
// 32 CNs of second BN
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
......@@ -134,7 +133,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[1]*Z + 63)>>6;
fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numCnInCnGroups[1] );
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[1]*NR_LDPC_ZMAX)>>6;
......@@ -142,7 +141,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
// Loop over every BN
for (j=0; j<4; j++)
{
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>6)+lut_idxCnProcG4[j][0]/2);
......@@ -187,7 +186,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[2]*Z + 63)>>6;
fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numCnInCnGroups[2] );
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[2]*NR_LDPC_ZMAX)>>6;
......@@ -197,7 +196,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
for (j=0; j<5; j++)
{
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>6)+lut_idxCnProcG5[j][0]/2);
......@@ -241,7 +240,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[3]*Z + 63)>>6;
fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numCnInCnGroups[3] );
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[3]*NR_LDPC_ZMAX)>>6;
......@@ -251,7 +250,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
for (j=0; j<6; j++)
{
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>6)+lut_idxCnProcG6[j][0]/2);
......@@ -297,7 +296,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[4]*Z + 63)>>6;
fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numCnInCnGroups[4] );
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[4]*NR_LDPC_ZMAX)>>6;
......@@ -307,7 +306,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
for (j=0; j<8; j++)
{
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>6)+lut_idxCnProcG8[j][0]/2);
......@@ -353,7 +352,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
{
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[5]*Z + 63)>>6;
fprintf(fd," M = (%d*Z + 63)>>6;\n",lut_numCnInCnGroups[5] );
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[5]*NR_LDPC_ZMAX)>>6;
......@@ -363,7 +362,7 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
for (j=0; j<10; j++)
{
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," sgn = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>6)+lut_idxCnProcG10[j][0]/2);
......@@ -404,3 +403,4 @@ void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t Z,int R)
#include <stdio.h>
#include <stdint.h>
#define NB_Z 51
void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t,int);
void nrLDPC_cnProc_BG2_generator_AVX512(uint16_t,int);
#define NB_R 3
void nrLDPC_cnProc_BG1_generator_AVX512(int);
void nrLDPC_cnProc_BG2_generator_AVX512(int);
int main()
{
uint16_t Z[NB_Z]={2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,22,24,26,28,30,32,36,40,44,48,52,56,60,64,72,80,88,96,104,112,120,128,144,160,176,192,208,224,240,256,288,320,352,384};
int R[NB_R]={0,1,2};
for(int i=0; i<NB_R;i++){
nrLDPC_cnProc_BG1_generator_AVX512(R[i]);
nrLDPC_cnProc_BG2_generator_AVX512(R[i]);
for(int i=0; i<NB_Z;i++){
}
nrLDPC_cnProc_BG1_generator_AVX512(Z[i],0);
nrLDPC_cnProc_BG2_generator_AVX512(Z[i],0);
}
return(0);
}
/**
 * Generated bit-node processing kernel (name suggests base graph 1, rate 8/9, AVX2).
 *
 * For every BN group that is populated in this configuration, and for every CN
 * slot of that group, computes
 *     bnProcBufRes[slot] = saturating_sub_int8(llrRes, bnProcBuf[slot])
 * on 256-bit vectors (32 int8 lanes at a time).
 *
 * bnProcBuf     input messages, read as aligned __m256i vectors
 * bnProcBufRes  output messages, written at the same offsets as bnProcBuf
 * llrRes        LLR values subtracted from; the same LLR run is reused for
 *               every CN slot of a group
 * Z             lifting size; determines how many vectors are processed per slot
 *
 * All three buffers are dereferenced as __m256i, so they must be 32-byte aligned.
 */
static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
    const __m256i* msgIn;   // start of the current group inside bnProcBuf
    __m256i* msgOut;        // start of the current group inside bnProcBufRes
    const __m256i* llr;     // start of the current group inside llrRes
    uint32_t vecCount;      // 256-bit vectors to process per CN slot (rounded up)
    uint32_t cn;            // CN slot index within the group
    uint32_t v;             // vector index within the slot

    // Group with 2 CNs: slots are 36 vectors apart
    vecCount = (3*Z + 31)>>5;
    msgIn  = (const __m256i*) &bnProcBuf[384];
    msgOut = (__m256i*) &bnProcBufRes[384];
    llr    = (const __m256i*) &llrRes[384];
    for (cn = 0; cn < 2; cn++) {
        for (v = 0; v < vecCount; v++) {
            msgOut[cn*36 + v] = _mm256_subs_epi8(llr[v], msgIn[cn*36 + v]);
        }
    }

    // Group with 3 CNs: slots are 252 vectors apart
    vecCount = (21*Z + 31)>>5;
    msgIn  = (const __m256i*) &bnProcBuf[2688];
    msgOut = (__m256i*) &bnProcBufRes[2688];
    llr    = (const __m256i*) &llrRes[1536];
    for (cn = 0; cn < 3; cn++) {
        for (v = 0; v < vecCount; v++) {
            msgOut[cn*252 + v] = _mm256_subs_epi8(llr[v], msgIn[cn*252 + v]);
        }
    }

    // Group with 4 CNs: slots are 12 vectors apart
    vecCount = (1*Z + 31)>>5;
    msgIn  = (const __m256i*) &bnProcBuf[26880];
    msgOut = (__m256i*) &bnProcBufRes[26880];
    llr    = (const __m256i*) &llrRes[9600];
    for (cn = 0; cn < 4; cn++) {
        for (v = 0; v < vecCount; v++) {
            msgOut[cn*12 + v] = _mm256_subs_epi8(llr[v], msgIn[cn*12 + v]);
        }
    }

    // Group with 5 CNs: slots are 12 vectors apart
    vecCount = (1*Z + 31)>>5;
    msgIn  = (const __m256i*) &bnProcBuf[28416];
    msgOut = (__m256i*) &bnProcBufRes[28416];
    llr    = (const __m256i*) &llrRes[9984];
    for (cn = 0; cn < 5; cn++) {
        for (v = 0; v < vecCount; v++) {
            msgOut[cn*12 + v] = _mm256_subs_epi8(llr[v], msgIn[cn*12 + v]);
        }
    }

    // Groups with 6 to 30 CNs: no code is generated for them in this kernel.
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment