Commit 71a8d19b authored by Sy's avatar Sy

use of avx2 & avx512 at CN Processing level

parent 3ed622af
......@@ -1305,26 +1305,30 @@ set(PHY_TURBOIF
set(PHY_LDPC_ORIG_SRC
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_encoder/ldpc_encoder.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13_AVX2.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13_AVX512.c
)
set(PHY_LDPC_OPTIM_SRC
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_encoder/ldpc_encoder_optim.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13_AVX2.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13_AVX512.c
)
set(PHY_LDPC_OPTIM8SEG_SRC
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_encoder/ldpc_encoder_optim8seg.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13_AVX2.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13_AVX2.c
)
set(PHY_LDPC_OPTIM8SEGMULTI_SRC
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_encoder/ldpc_encoder_optim8segmulti.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13_AVX2.c
${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/ldpc_gen_files/nrLDPC_cnProc_BG1_Z384_13_AVX512.c
)
set(PHY_NR_CODINGIF
......
......@@ -147,7 +147,8 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP
#endif
if (BG == 1)
{ if(Z==384){
nrLDPC_cnProc_BG1_Z384_13(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes);
nrLDPC_cnProc_BG1_Z384_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes);
//nrLDPC_cnProc_BG1_Z384_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes);
}else{
nrLDPC_cnProc_BG1(p_lut, p_procBuf, Z);
}
......@@ -250,7 +251,8 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP
if (BG == 1)
{
if(Z==384){
nrLDPC_cnProc_BG1_Z384_13(p_procBuf->cnProcBuf, p_procBuf->cnProcBufRes);
nrLDPC_cnProc_BG1_Z384_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes);
//nrLDPC_cnProc_BG1_Z384_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes);
}else{
nrLDPC_cnProc_BG1(p_lut, p_procBuf, Z);
}
......@@ -363,7 +365,8 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP
#endif
if (BG == 1)
{ if(Z==384){
nrLDPC_cnProc_BG1_Z384_13(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes);
nrLDPC_cnProc_BG1_Z384_13_AVX512(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes);
//nrLDPC_cnProc_BG1_Z384_13_AVX2(p_procBuf->cnProcBuf,p_procBuf->cnProcBufRes);
}else{
nrLDPC_cnProc_BG1(p_lut, p_procBuf, Z);
}
......
C=gcc
CFLAGS=-W -Wall -mavx2
LDFLAGS=
EXEC=cnProc_gen
EXEC=cnProc_gen_avx2
SRC= $(wildcard *.c)
OBJ= $(SRC:.c=.o)
all: $(EXEC)
cnProc_gen: $(OBJ)
cnProc_gen_avx2: $(OBJ)
$(CC) -o $@ $^ $(LDFLAGS) -O2 -pg
main.o: cnProc_gen.h
main.o: cnProc_gen_avx2.h
%.o: %.c
$(CC) -o $@ -c $< $(CFLAGS) -I ${OPENAIR_HOME}/openair1 -g -pg
......@@ -24,4 +24,4 @@ mrproper: clean
rm -rf $(EXEC)
zip:
tar -zcvf sauvegarde.tar.gz main.c cnProc_gen.c cnProc_geno.h Makefile
tar -zcvf sauvegarde.tar.gz main.c cnProc_gen_avx2.c cnProc_gen_avx2.h Makefile
#include <stdint.h>
#include <immintrin.h>
#include "../../nrLDPCdecoder_defs.h"
#include "../../nrLDPC_types.h"
#include "../../nrLDPC_bnProc.h"
#include "cnProc_gen_avx2.h"
void nrLDPC_cnProc_BG1_generator_AVX2(uint16_t Z,int R)
{
const char *ratestr[3]={"13","23","89"};
if (R<0 || R>2) {printf("Illegal R %d\n",R); abort();}
// system("mkdir -p ldpc_gen_files");
char fname[50];
sprintf(fname,"../ldpc_gen_files/nrLDPC_cnProc_BG1_Z%d_%s_AVX2.c",Z,ratestr[R]);
FILE *fd=fopen(fname,"w");
if (fd == NULL) {printf("Cannot create %s\n");abort();}
fprintf(fd,"#include <stdint.h>\n");
fprintf(fd,"#include <immintrin.h>\n");
fprintf(fd,"void nrLDPC_cnProc_BG1_Z%d_%s_AVX2(int8_t* cnProcBuf,int8_t* cnProcBufRes) {\n",Z,ratestr[R]);
const uint8_t* lut_numCnInCnGroups;
const uint32_t* lut_startAddrCnGroups = lut_startAddrCnGroups_BG1;
if (R==0) lut_numCnInCnGroups = lut_numCnInCnGroups_BG1_R13;
else if (R==1) lut_numCnInCnGroups = lut_numCnInCnGroups_BG1_R23;
else if (R==2) lut_numCnInCnGroups = lut_numCnInCnGroups_BG1_R89;
else { printf("aborting, illegal R %d\n",R); fclose(fd);abort();}
//__m256i* p_cnProcBuf;
//__m256i* p_cnProcBufRes;
// Number of CNs in Groups
uint32_t M;
uint32_t j;
uint32_t k;
// Offset to each bit within a group in terms of 32 Byte
uint32_t bitOffsetInGroup;
//__m256i ymm0, min, sgn;
//__m256i* p_cnProcBufResBit;
// const __m256i* p_ones = (__m256i*) ones256_epi8;
// const __m256i* p_maxLLR = (__m256i*) maxLLR256_epi8;
// LUT with offsets for bits that need to be processed
// 1. bit proc requires LLRs of 2. and 3. bit, 2.bits of 1. and 3. etc.
// Offsets are in units of bitOffsetInGroup (1*384/32)
// const uint8_t lut_idxCnProcG3[3][2] = {{12,24}, {0,24}, {0,12}};
// =====================================================================
// Process group with 3 BNs
fprintf(fd,"//Process group with 3 BNs\n");
// LUT with offsets for bits that need to be processed
// 1. bit proc requires LLRs of 2. and 3. bit, 2.bits of 1. and 3. etc.
// Offsets are in units of bitOffsetInGroup (1*384/32)
const uint8_t lut_idxCnProcG3[3][2] = {{12,24}, {0,24}, {0,12}};
fprintf(fd," __m256i ymm0, min, sgn,ones,maxLLR;\n");
fprintf(fd," ones = _mm256_set1_epi8((char)1);\n");
fprintf(fd," maxLLR = _mm256_set1_epi8((char)127);\n");
if (lut_numCnInCnGroups[0] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[0]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 3
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[0]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[0]];
// Loop over every BN
int iprime=0;
for (j=0; j<3; j++)
{
// Set of results pointer to correct BN address
//p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i+=2) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// 32 CNs of second BN
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup));
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]+1);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// 32 CNs of second BN
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]+1);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup)+1);
fprintf(fd," }\n");
}
}
// =====================================================================
// Process group with 4 BNs
fprintf(fd,"//Process group with 4 BNs\n");
// Offset is 5*384/32 = 60
const uint8_t lut_idxCnProcG4[4][3] = {{60,120,180}, {0,120,180}, {0,60,180}, {0,60,120}};
if (lut_numCnInCnGroups[1] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[1]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
for (j=0; j<4; j++)
{
// Set of results pointer to correct BN address
//p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// Loop over BNs
for (k=1; k<3; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[1]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
// =====================================================================
// Process group with 5 BNs
fprintf(fd,"//Process group with 5 BNs\n");
// Offset is 18*384/32 = 216
const uint16_t lut_idxCnProcG5[5][4] = {{216,432,648,864}, {0,432,648,864},
{0,216,648,864}, {0,216,432,864}, {0,216,432,648}};
if (lut_numCnInCnGroups[2] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[2]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
for (j=0; j<5; j++)
{
// Set of results pointer to correct BN address
//p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// Loop over BNs
for (k=1; k<4; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[2]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
// =====================================================================
// Process group with 6 BNs
fprintf(fd,"//Process group with 6 BNs\n");
// Offset is 8*384/32 = 96
const uint16_t lut_idxCnProcG6[6][5] = {{96,192,288,384,480}, {0,192,288,384,480},
{0,96,288,384,480}, {0,96,192,384,480},
{0,96,192,288,480}, {0,96,192,288,384}};
if (lut_numCnInCnGroups[3] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[3]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
for (j=0; j<6; j++)
{
// Set of results pointer to correct BN address
//p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// Loop over BNs
for (k=1; k<5; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[3]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
// =====================================================================
// Process group with 7 BNs
fprintf(fd,"//Process group with 7 BNs\n");
// Offset is 5*384/32 = 60
const uint16_t lut_idxCnProcG7[7][6] = {{60,120,180,240,300,360}, {0,120,180,240,300,360},
{0,60,180,240,300,360}, {0,60,120,240,300,360},
{0,60,120,180,300,360}, {0,60,120,180,240,360},
{0,60,120,180,240,300}};
if (lut_numCnInCnGroups[4] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[4]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
for (j=0; j<7; j++)
{
// Set of results pointer to correct BN address
//p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG7[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// Loop over BNs
for (k=1; k<6; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG7[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[4]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
// =====================================================================
// Process group with 8 BNs
fprintf(fd,"//Process group with 8 BNs\n");
// Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,120,144,168}, {0,48,72,96,120,144,168},
{0,24,72,96,120,144,168}, {0,24,48,96,120,144,168},
{0,24,48,72,120,144,168}, {0,24,48,72,96,144,168},
{0,24,48,72,96,120,168}, {0,24,48,72,96,120,144}};
if (lut_numCnInCnGroups[5] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[5]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
for (j=0; j<8; j++)
{
// Set of results pointer to correct BN address
//p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG8[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// Loop over BNs
for (k=1; k<7; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG8[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[5]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
// =====================================================================
// Process group with 9 BNs
fprintf(fd,"//Process group with 9 BNs\n");
// Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG9[9][8] = {{24,48,72,96,120,144,168,192}, {0,48,72,96,120,144,168,192},
{0,24,72,96,120,144,168,192}, {0,24,48,96,120,144,168,192},
{0,24,48,72,120,144,168,192}, {0,24,48,72,96,144,168,192},
{0,24,48,72,96,120,168,192}, {0,24,48,72,96,120,144,192},
{0,24,48,72,96,120,144,168}};
if (lut_numCnInCnGroups[6] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[6]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 9
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
for (j=0; j<9; j++)
{
// Set of results pointer to correct BN address
//p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>5)+lut_idxCnProcG9[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// Loop over BNs
for (k=1; k<8; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>5)+lut_idxCnProcG9[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[6]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
// =====================================================================
// Process group with 10 BNs
fprintf(fd,"//Process group with 10 BNs\n");
// Offset is 1*384/32 = 12
const uint8_t lut_idxCnProcG10[10][9] = {{12,24,36,48,60,72,84,96,108}, {0,24,36,48,60,72,84,96,108},
{0,12,36,48,60,72,84,96,108}, {0,12,24,48,60,72,84,96,108},
{0,12,24,36,60,72,84,96,108}, {0,12,24,36,48,72,84,96,108},
{0,12,24,36,48,60,84,96,108}, {0,12,24,36,48,60,72,96,108},
{0,12,24,36,48,60,72,84,108}, {0,12,24,36,48,60,72,84,96}};
if (lut_numCnInCnGroups[7] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[7]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 10
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
for (j=0; j<10; j++)
{
// Set of results pointer to correct BN address
//p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>5)+lut_idxCnProcG10[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// Loop over BNs
for (k=1; k<9; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>5)+lut_idxCnProcG10[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[7]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
// =====================================================================
// Process group with 19 BNs
fprintf(fd,"//Process group with 19 BNs\n");
// Offset is 4*384/32 = 12
const uint16_t lut_idxCnProcG19[19][18] = {{48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816}};
if (lut_numCnInCnGroups[8] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[8]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 19
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
for (j=0; j<19; j++)
{
// Set of results pointer to correct BN address
//p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>5)+lut_idxCnProcG19[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// Loop over BNs
for (k=1; k<18; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>5)+lut_idxCnProcG19[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[8]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
fprintf(fd,"}\n");
fclose(fd);
}//end of the function nrLDPC_cnProc_BG1
#ifndef NRLDPC_CN_GEN
#define NRLDPC_CN_GEN
void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R);
void nrLDPC_cnProc_BG1_generator_AVX2(uint16_t Z,int R);
#endif
#include <stdio.h>
#include <immintrin.h>
#include "../../nrLDPC_types.h"
#include "../../nrLDPC_init.h"
#include "../../nrLDPC_bnProc.h"
#include "cnProc_gen_avx2.h"
int main(int argc, char *argv [])
{
// Z=384, R=1/3
nrLDPC_cnProc_BG1_generator_AVX2(384,0);
return(0);
}
C=gcc
CFLAGS=-W -Wall
LDFLAGS=
EXEC=cnProc_gen_avx512
SRC= $(wildcard *.c)
OBJ= $(SRC:.c=.o)
all: $(EXEC)
cnProc_gen_avx512: $(OBJ)
$(CC) -o $@ $^ $(LDFLAGS) -O2 -pg
main.o: cnProc_gen_avx512.h
%.o: %.c
$(CC) -o $@ -c $< $(CFLAGS) -I ${OPENAIR_HOME}/openair1 -g -pg
.PHONY: clean mrproper
clean:
rm -rf *.o
mrproper: clean
rm -rf $(EXEC)
zip:
tar -zcvf sauvegarde.tar.gz main.c cnProc_gen_avx512.c cnProc_gen_avx512.h Makefile
#include <stdint.h>
#include <immintrin.h>
#include "../nrLDPCdecoder_defs.h"
#include "../nrLDPC_types.h"
#include "../nrLDPC_bnProc.h"
#include "cnProc_gen.h"
#include "../../nrLDPCdecoder_defs.h"
#include "../../nrLDPC_types.h"
#include "../../nrLDPC_bnProc.h"
#include "cnProc_gen_avx512.h"
void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R)
{
const char *ratestr[3]={"13","23","89"};
if (R<0 || R>2) {printf("Illegal R %d\n",R); abort();}
system("mkdir -p ldpc_gen_files");
// system("mkdir -p ldpc_gen_files");
char fname[50];
sprintf(fname,"ldpc_gen_files/nrLDPC_cnProc_BG1_Z%d_%s.c",Z,ratestr[R]);
sprintf(fname,"../ldpc_gen_files/nrLDPC_cnProc_BG1_Z%d_%s_AVX512.c",Z,ratestr[R]);
FILE *fd=fopen(fname,"w");
if (fd == NULL) {printf("Cannot create %s\n");abort();}
......@@ -31,7 +31,7 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
fprintf(fd, "}\n" );
fprintf(fd,"void nrLDPC_cnProc_BG1_Z%d_%s(int8_t* cnProcBuf,int8_t* cnProcBufRes) {\n",Z,ratestr[R]);
fprintf(fd,"void nrLDPC_cnProc_BG1_Z%d_%s_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes) {\n",Z,ratestr[R]);
const uint8_t* lut_numCnInCnGroups;
const uint32_t* lut_startAddrCnGroups = lut_startAddrCnGroups_BG1;
......@@ -48,7 +48,7 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
uint32_t M;
uint32_t j;
uint32_t k;
// Offset to each bit within a group in terms of 32 Byte
// Offset to each bit within a group in terms of 64 Byte
uint32_t bitOffsetInGroup;
//__m512i zmm0, min, sgn;
......@@ -79,11 +79,11 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
if (lut_numCnInCnGroups[0] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[0]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>5;
......@@ -103,7 +103,7 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// {
fprintf(fd," for (int i=0;i<%d;i+=2) {\n",M);
// Abs and sign of 32 CNs (first BN)
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
......@@ -128,7 +128,7 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// p_cnProcBufResBit++;
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup));
// Abs and sign of 32 CNs (first BN)
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]+1);
// sgn = _mm512_sign_epi16(ones, zmm0);
......@@ -165,11 +165,11 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
if (lut_numCnInCnGroups[1] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[1]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>5;
......@@ -188,7 +188,7 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
......@@ -230,11 +230,11 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
if (lut_numCnInCnGroups[2] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[2]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>5;
......@@ -253,7 +253,7 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
......@@ -295,11 +295,11 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
if (lut_numCnInCnGroups[3] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[3]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>5;
......@@ -318,7 +318,7 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
......@@ -362,11 +362,11 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
if (lut_numCnInCnGroups[4] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[4]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>5;
......@@ -385,7 +385,7 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG7[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
......@@ -430,11 +430,11 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
if (lut_numCnInCnGroups[5] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[5]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>5;
......@@ -453,7 +453,7 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG8[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
......@@ -499,11 +499,11 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
if (lut_numCnInCnGroups[6] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[6]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>5;
......@@ -522,7 +522,7 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>5)+lut_idxCnProcG9[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
......@@ -569,11 +569,11 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
if (lut_numCnInCnGroups[7] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[7]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>5;
......@@ -592,7 +592,7 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>5)+lut_idxCnProcG10[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
......@@ -642,11 +642,11 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
if (lut_numCnInCnGroups[8] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Number of groups of 64 CNs for parallel processing
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[8]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
// Set the offset to each bit within a group in terms of 64 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>5;
......@@ -665,7 +665,7 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// Abs and sign of 64 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>5)+lut_idxCnProcG19[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
......@@ -699,3 +699,9 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
fprintf(fd,"}\n");
fclose(fd);
}//end of the function nrLDPC_cnProc_BG1
#ifndef NRLDPC_CN_GEN
#define NRLDPC_CN_GEN
void nrLDPC_cnProc_BG1_generator_AVX512(uint16_t Z,int R);
#endif
#include <stdio.h>
#include <immintrin.h>
//#include "../nrLDPCdecoder_defs.h"
#include "../../nrLDPC_types.h"
#include "../../nrLDPC_init.h"
#include "../../nrLDPC_bnProc.h"
#include "cnProc_gen_avx512.h"
int main(int argc, char *argv [])
{
// Z=384, R=1/3
nrLDPC_cnProc_BG1_generator_AVX512(384,0);
return(0);
}
#ifndef NR_CN_PROC_BG1_OPTIM
#define NR_CN_PROC_BG1_OPTIM
void nrLDPC_cnProc_BG1_Z384_13(int8_t* cnProcBuf,int8_t* cnProcBufRes);
void nrLDPC_cnProc_BG1_Z384_13_AVX512(int8_t* cnProcBuf,int8_t* cnProcBufRes);
void nrLDPC_cnProc_BG1_Z384_13_AVX2(int8_t* cnProcBuf,int8_t* cnProcBufRes);
#endif
#include <stdio.h>
#include <immintrin.h>
//#include "../nrLDPCdecoder_defs.h"
#include "../nrLDPC_types.h"
#include "../nrLDPC_init.h"
//#include "../nrLDPC_mPass.h"
//#include "../nrLDPC_cnProc.h"
#include "../nrLDPC_bnProc.h"
#include "cnProc_gen.h"
int main(int argc, char *argv [])
{
//short lift_size[51]= {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,20,22,24,26,28,30,32,36,40,44,48,52,56,60,64,72,80,88,96,104,112,120,128,144,160,176,192,208,224,240,256,288,320,352,384};
// unsigned int errors, errors_bit, crc_misses;
// double errors_bit_uncoded;
//short block_length=8448; // decoder supports length: 1201 -> 1280, 2401 -> 2560
// short No_iteration=5;
//int n_segments=1;
//double rate=0.333;
//int nom_rate=1;
//int denom_rate=3;
//double SNR0=-2.0,SNR,SNR_lin;
//unsigned char qbits=8;
// unsigned int decoded_errors[10000]; // initiate the size of matrix equivalent to size of SNR
//int c,i=0, i1 = 0;
// int n_trials = 1;
// double SNR_step = 0.1;
// randominit(0);
//int test_uncoded= 0;
//short BG=1,Zc,Kb;
// cpu_freq_GHz = get_cpu_freq_GHz();
//printf("the decoder supports BG2, Kb=10, Z=128 & 256\n");
//printf(" range of blocklength: 1201 -> 1280, 2401 -> 2560\n");
// printf("block length %d: \n", block_length);
//printf("n_trials %d: \n", n_trials);
// printf("SNR0 %f: \n", SNR0);
//find minimum value in all sets of lifting size
/* Zc=0;
for (i1=0; i1 < 51; i1++)
{
if (lift_size[i1] >= (double) block_length/Kb)
{
Zc = lift_size[i1];
//printf("%d\n",Zc);
break;
}
}*/
// Allocate LDPC decoder buffers
// p_nrLDPC_procBuf = nrLDPC_init_mem();
// load_nrLDPClib();
// load_nrLDPClib_ref("_orig", &encoder_orig);
// Z=384, R=1/3
nrLDPC_cnProc_BG1_generator(384,0);
//nrLDPC_cnProc_BG1(&lut_numCnInCnGroups, &cnProcBuf, 380);
//for (block_length=8;block_length<=MAX_BLOCK_LENGTH;block_length+=8)
//determine number of bits in codeword
/*
char fname[200];
sprintf(fname,"cnProc_BG1_Zc_%d.c",384);
FILE *fd=fopen(fname,"w");
// AssertFatal(fd!=NULL,"cannot open %s\n",fname);
*/
//fclose(fd);
return(0);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment