Commit 845a2264 authored by Sy's avatar Sy

optimized 5G NR LDPC decoder

parent c8a787d2
/* /*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
...@@ -328,17 +329,17 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP ...@@ -328,17 +329,17 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP
{ {
case 13: case 13:
{ {
nrLDPC_bnProcPc_BG1_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG1_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
case 23: case 23:
{ {
nrLDPC_bnProcPc_BG1_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG1_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
case 89: case 89:
{ {
nrLDPC_bnProcPc_BG1_R89_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG1_R89_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
} }
...@@ -349,20 +350,20 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP ...@@ -349,20 +350,20 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP
{ {
case 15: case 15:
{ {
nrLDPC_bnProcPc_BG2_R15_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG2_R15_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
case 13: case 13:
{ {
nrLDPC_bnProcPc_BG2_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG2_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
case 23: case 23:
{ {
nrLDPC_bnProcPc_BG2_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG2_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
...@@ -614,17 +615,17 @@ if (BG==1) ...@@ -614,17 +615,17 @@ if (BG==1)
{ {
case 13: case 13:
{ {
nrLDPC_bnProcPc_BG1_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG1_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
case 23: case 23:
{ {
nrLDPC_bnProcPc_BG1_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG1_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
case 89: case 89:
{ {
nrLDPC_bnProcPc_BG1_R89_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG1_R89_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
} }
...@@ -635,20 +636,20 @@ if (BG==1) ...@@ -635,20 +636,20 @@ if (BG==1)
{ {
case 15: case 15:
{ {
nrLDPC_bnProcPc_BG2_R15_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG2_R15_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
case 13: case 13:
{ {
nrLDPC_bnProcPc_BG2_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG2_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
case 23: case 23:
{ {
nrLDPC_bnProcPc_BG2_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG2_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
...@@ -723,7 +724,7 @@ if (BG==1) ...@@ -723,7 +724,7 @@ if (BG==1)
#ifdef __AVX512BW__ #ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R13_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z); nrLDPC_bnProc_BG2_R13_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else #else
nrLDPC_bnProc_BG2_R13_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z); nrLDPC_bnProc_BG2_R13_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,+p_procBuf->llrRes, Z);
#endif #endif
break; break;
} }
...@@ -906,7 +907,7 @@ if (BG==1) ...@@ -906,7 +907,7 @@ if (BG==1)
#ifdef NR_LDPC_PROFILER_DETAIL #ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->bnProcPc); start_meas(&p_profiler->bnProcPc);
#endif #endif
//nrLDPC_bnProcPc(p_lut, p_procBuf, Z); // nrLDPC_bnProcPc(p_lut, p_procBuf, Z);
if (BG==1) if (BG==1)
{ {
...@@ -914,17 +915,17 @@ if (BG==1) ...@@ -914,17 +915,17 @@ if (BG==1)
{ {
case 13: case 13:
{ {
nrLDPC_bnProcPc_BG1_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG1_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
case 23: case 23:
{ {
nrLDPC_bnProcPc_BG1_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG1_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
case 89: case 89:
{ {
nrLDPC_bnProcPc_BG1_R89_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG1_R89_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
} }
...@@ -935,20 +936,20 @@ if (BG==1) ...@@ -935,20 +936,20 @@ if (BG==1)
{ {
case 15: case 15:
{ {
nrLDPC_bnProcPc_BG2_R15_AVX2(p_procBuf->bnProcBuf, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG2_R15_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
case 13: case 13:
{ {
nrLDPC_bnProcPc_BG2_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG2_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
case 23: case 23:
{ {
nrLDPC_bnProcPc_BG2_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z); nrLDPC_bnProcPc_BG2_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break; break;
} }
...@@ -1136,3 +1137,4 @@ if (BG==1) ...@@ -1136,3 +1137,4 @@ if (BG==1)
#include <stdint.h> #include <stdint.h>
#include <immintrin.h> #include <immintrin.h>
#include "../../nrLDPCdecoder_defs.h" #include "../../nrLDPCdecoder_defs.h"
...@@ -24,7 +25,7 @@ void nrLDPC_bnProcPc_BG1_generator_AVX2(int R) ...@@ -24,7 +25,7 @@ void nrLDPC_bnProcPc_BG1_generator_AVX2(int R)
// fprintf(fd,"#include <stdint.h>\n"); // fprintf(fd,"#include <stdint.h>\n");
// fprintf(fd,"#include <immintrin.h>\n"); // fprintf(fd,"#include <immintrin.h>\n");
fprintf(fd,"static inline void nrLDPC_bnProcPc_BG1_R%s_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {\n",ratestr[R]); fprintf(fd,"static inline void nrLDPC_bnProcPc_BG1_R%s_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {\n",ratestr[R]);
const uint8_t* lut_numBnInBnGroups; const uint8_t* lut_numBnInBnGroups;
const uint32_t* lut_startAddrBnGroups; const uint32_t* lut_startAddrBnGroups;
const uint16_t* lut_startAddrBnGroupsLlr; const uint16_t* lut_startAddrBnGroupsLlr;
...@@ -64,16 +65,18 @@ void nrLDPC_bnProcPc_BG1_generator_AVX2(int R) ...@@ -64,16 +65,18 @@ void nrLDPC_bnProcPc_BG1_generator_AVX2(int R)
fprintf(fd," __m128i* p_bnProcBuf; \n"); fprintf(fd," __m128i* p_bnProcBuf; \n");
fprintf(fd," __m128i* p_llrProcBuf;\n"); fprintf(fd," __m128i* p_llrProcBuf;\n");
fprintf(fd," __m256i* p_llrRes; \n"); fprintf(fd," __m256i* p_llrRes; \n");
// fprintf(fd," __m256i* p_bnProcBufRes; \n");
// fprintf(fd," __m256i* p_llrProcBuf256; \n");
fprintf(fd," uint32_t M ;\n"); fprintf(fd," uint32_t M ;\n");
fprintf(fd, "// Process group with 1 CNs \n"); fprintf(fd, "// Process group with 1 CNs \n");
// Process group with 2 CNs
/* /*
if (lut_numBnInBnGroups[0] > 0) // Process group with 1 CNs
{
// if (lut_numBnInBnGroups[0] > 0)
// {
// If elements in group move to next address // If elements in group move to next address
// idxBnGroup++; // idxBnGroup++;
...@@ -81,36 +84,30 @@ fprintf(fd, "// Process group with 1 CNs \n"); ...@@ -81,36 +84,30 @@ fprintf(fd, "// Process group with 1 CNs \n");
fprintf(fd," M = (%d*Z + 31)>>5;\n",lut_numBnInBnGroups[0] ); fprintf(fd," M = (%d*Z + 31)>>5;\n",lut_numBnInBnGroups[0] );
// Set the offset to each CN within a group in terms of 16 Byte // Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[0]*NR_LDPC_ZMAX)>>4; // cnOffsetInGroup = (lut_numBnInBnGroups[0]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 2 // Set pointers to start of group 2
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
// fprintf(fd," p_bnProcBufRes = (__m256i*) &bnProcBufRes [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// fprintf(fd," p_llrProcBuf256 = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs // Loop over BNs
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n");
// Loop over CNs fprintf(fd," p_bnProcBufRes[i] = p_llrProcBuf256[i];\n");
for (k=1; k<1; k++)
{
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup); // First 16 LLRs of first CN
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd, " ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1); \n"); fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
} fprintf(fd," ymmRes0 = _mm256_adds_epi16(ymm0, ymm1);\n");
// Add LLR from receiver input
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n"); // Second 16 LLRs of first CN
fprintf(fd," ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);\n"); fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1 ]);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j + 1 ]);\n");
fprintf(fd," ymmRes1 = _mm256_adds_epi16(ymm0, ymm1);\n");
// Pack results back to epi8 // Pack results back to epi8
fprintf(fd," ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);\n"); fprintf(fd," ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);\n");
...@@ -120,10 +117,11 @@ fprintf(fd, "// Process group with 1 CNs \n"); ...@@ -120,10 +117,11 @@ fprintf(fd, "// Process group with 1 CNs \n");
fprintf(fd,"}\n"); fprintf(fd,"}\n");
} //}
*/
// ===================================================================== // =====================================================================
// Process group with 2 CNs // Process group with 2 CNs
*/
fprintf(fd, "// Process group with 2 CNs \n"); fprintf(fd, "// Process group with 2 CNs \n");
...@@ -148,8 +146,8 @@ fprintf(fd, "// Process group with 2 CNs \n"); ...@@ -148,8 +146,8 @@ fprintf(fd, "// Process group with 2 CNs \n");
// Loop over BNs // Loop over BNs
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN // First 16 LLRs of first CN
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n"); fprintf(fd," ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n"); fprintf(fd," ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n");
// Loop over CNs // Loop over CNs
for (k=1; k<2; k++) for (k=1; k<2; k++)
...@@ -1767,3 +1765,4 @@ fprintf(fd, "// Process group with 30 CNs \n"); ...@@ -1767,3 +1765,4 @@ fprintf(fd, "// Process group with 30 CNs \n");
#include <stdint.h> #include <stdint.h>
#include <immintrin.h> #include <immintrin.h>
#include "../../nrLDPCdecoder_defs.h" #include "../../nrLDPCdecoder_defs.h"
...@@ -25,7 +21,7 @@ void nrLDPC_bnProcPc_BG2_generator_AVX2(int R) ...@@ -25,7 +21,7 @@ void nrLDPC_bnProcPc_BG2_generator_AVX2(int R)
// fprintf(fd,"#include <stdint.h>\n"); // fprintf(fd,"#include <stdint.h>\n");
//fprintf(fd,"#include <immintrin.h>\n"); //fprintf(fd,"#include <immintrin.h>\n");
fprintf(fd,"static inline void nrLDPC_bnProcPc_BG2_R%s_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {\n",ratestr[R]); fprintf(fd,"static inline void nrLDPC_bnProcPc_BG2_R%s_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {\n",ratestr[R]);
const uint8_t* lut_numBnInBnGroups; const uint8_t* lut_numBnInBnGroups;
const uint32_t* lut_startAddrBnGroups; const uint32_t* lut_startAddrBnGroups;
const uint16_t* lut_startAddrBnGroupsLlr; const uint16_t* lut_startAddrBnGroupsLlr;
...@@ -51,10 +47,9 @@ void nrLDPC_bnProcPc_BG2_generator_AVX2(int R) ...@@ -51,10 +47,9 @@ void nrLDPC_bnProcPc_BG2_generator_AVX2(int R)
} }
else { printf("aborting, illegal R %d\n",R); fclose(fd);abort();} else { printf("aborting, illegal R %d\n",R); fclose(fd);abort();}
// Number of BNs in Groups
// uint32_t M;
//uint32_t M32rem;
//uint32_t i,j;
uint32_t k; uint32_t k;
// Offset to each bit within a group in terms of 32 Byte // Offset to each bit within a group in terms of 32 Byte
uint32_t cnOffsetInGroup; uint32_t cnOffsetInGroup;
...@@ -66,16 +61,18 @@ void nrLDPC_bnProcPc_BG2_generator_AVX2(int R) ...@@ -66,16 +61,18 @@ void nrLDPC_bnProcPc_BG2_generator_AVX2(int R)
fprintf(fd," __m128i* p_bnProcBuf; \n"); fprintf(fd," __m128i* p_bnProcBuf; \n");
fprintf(fd," __m128i* p_llrProcBuf;\n"); fprintf(fd," __m128i* p_llrProcBuf;\n");
fprintf(fd," __m256i* p_llrRes; \n"); fprintf(fd," __m256i* p_llrRes; \n");
// fprintf(fd," __m256i* p_bnProcBufRes; \n");
// fprintf(fd," __m256i* p_llrProcBuf256; \n");
fprintf(fd," uint32_t M ;\n"); fprintf(fd," uint32_t M ;\n");
fprintf(fd, "// Process group with 1 CNs \n"); fprintf(fd, "// Process group with 1 CNs \n");
/*
// Process group with 1 CNs
// Process group with 2 CNs // if (lut_numBnInBnGroups[0] > 0)
// {
if (lut_numBnInBnGroups[0] > 0)
{
// If elements in group move to next address // If elements in group move to next address
// idxBnGroup++; // idxBnGroup++;
...@@ -83,36 +80,30 @@ fprintf(fd, "// Process group with 1 CNs \n"); ...@@ -83,36 +80,30 @@ fprintf(fd, "// Process group with 1 CNs \n");
fprintf(fd," M = (%d*Z + 31)>>5;\n",lut_numBnInBnGroups[0] ); fprintf(fd," M = (%d*Z + 31)>>5;\n",lut_numBnInBnGroups[0] );
// Set the offset to each CN within a group in terms of 16 Byte // Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[0]*NR_LDPC_ZMAX)>>4; // cnOffsetInGroup = (lut_numBnInBnGroups[0]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 2 // Set pointers to start of group 2
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]); fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_bnProcBufRes = (__m256i*) &bnProcBufRes [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrProcBuf256 = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]); fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs // Loop over BNs
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
fprintf(fd," p_bnProcBufRes[i] = p_llrProcBuf256[i];\n");
// First 16 LLRs of first CN // First 16 LLRs of first CN
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n"); fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n"); fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = _mm256_adds_epi16(ymm0, ymm1);\n");
// Loop over CNs
/*for (k=1; k<1; k++)
{
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1); \n"); // Second 16 LLRs of first CN
} fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1 ]);\n");
*/ fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j + 1 ]);\n");
// Add LLR from receiver input fprintf(fd," ymmRes1 = _mm256_adds_epi16(ymm0, ymm1);\n");
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);\n");
fprintf(fd," ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8 // Pack results back to epi8
fprintf(fd," ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);\n"); fprintf(fd," ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);\n");
...@@ -122,8 +113,8 @@ fprintf(fd, "// Process group with 1 CNs \n"); ...@@ -122,8 +113,8 @@ fprintf(fd, "// Process group with 1 CNs \n");
fprintf(fd,"}\n"); fprintf(fd,"}\n");
} //}
// ===================================================================== */ // =====================================================================
// Process group with 2 CNs // Process group with 2 CNs
...@@ -150,8 +141,8 @@ fprintf(fd, "// Process group with 2 CNs \n"); ...@@ -150,8 +141,8 @@ fprintf(fd, "// Process group with 2 CNs \n");
// Loop over BNs // Loop over BNs
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN // First 16 LLRs of first CN
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n"); fprintf(fd," ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n"); fprintf(fd," ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n");
// Loop over CNs // Loop over CNs
for (k=1; k<2; k++) for (k=1; k<2; k++)
...@@ -1771,4 +1762,3 @@ fprintf(fd, "// Process group with 30 CNs \n"); ...@@ -1771,4 +1762,3 @@ fprintf(fd, "// Process group with 30 CNs \n");
#include <stdint.h> #include <stdint.h>
#include <immintrin.h> #include <immintrin.h>
#include "../../nrLDPCdecoder_defs.h" #include "../../nrLDPCdecoder_defs.h"
...@@ -148,8 +144,8 @@ fprintf(fd, "// Process group with 2 CNs \n"); ...@@ -148,8 +144,8 @@ fprintf(fd, "// Process group with 2 CNs \n");
// Loop over BNs // Loop over BNs
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN // First 16 LLRs of first CN
fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n"); fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n");
// Loop over CNs // Loop over CNs
for (k=1; k<2; k++) for (k=1; k<2; k++)
......
...@@ -146,8 +146,8 @@ fprintf(fd, "// Process group with 2 CNs \n"); ...@@ -146,8 +146,8 @@ fprintf(fd, "// Process group with 2 CNs \n");
// Loop over BNs // Loop over BNs
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n"); fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN // First 16 LLRs of first CN
fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n"); fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n"); fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n");
// Loop over CNs // Loop over CNs
for (k=1; k<2; k++) for (k=1; k<2; k++)
......
static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) { static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
__m256i* p_bnProcBuf;
__m256i* p_bnProcBufRes;
__m256i* p_llrRes;
__m256i* p_res;
uint32_t M, i; uint32_t M, i;
// Process group with 2 CNs // Process group with 2 CNs
M = (3*Z + 31)>>5; M = (3*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [384];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [384];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [384];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); ((__m256i*)bnProcBufRes)[12 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[12 + i ], ((__m256i*) bnProcBuf)[12 + i]);
} }
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [384];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); ((__m256i*)bnProcBufRes)[48 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[12 + i ], ((__m256i*) bnProcBuf)[48 + i]);
} }
// Process group with 3 CNs // Process group with 3 CNs
M = (21*Z + 31)>>5; M = (21*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [2688];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [2688];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [1536];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); ((__m256i*)bnProcBufRes)[84 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[48 + i ], ((__m256i*) bnProcBuf)[84 + i]);
} }
p_res = &p_bnProcBufRes[252];
p_llrRes = (__m256i*) &llrRes [1536];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[252 + i]); ((__m256i*)bnProcBufRes)[336 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[48 + i ], ((__m256i*) bnProcBuf)[336 + i]);
} }
p_res = &p_bnProcBufRes[504];
p_llrRes = (__m256i*) &llrRes [1536];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[504 + i]); ((__m256i*)bnProcBufRes)[588 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[48 + i ], ((__m256i*) bnProcBuf)[588 + i]);
} }
// Process group with 4 CNs // Process group with 4 CNs
M = (1*Z + 31)>>5; M = (1*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [26880];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [26880];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); ((__m256i*)bnProcBufRes)[840 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[300 + i ], ((__m256i*) bnProcBuf)[840 + i]);
} }
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); ((__m256i*)bnProcBufRes)[852 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[300 + i ], ((__m256i*) bnProcBuf)[852 + i]);
} }
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); ((__m256i*)bnProcBufRes)[864 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[300 + i ], ((__m256i*) bnProcBuf)[864 + i]);
} }
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); ((__m256i*)bnProcBufRes)[876 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[300 + i ], ((__m256i*) bnProcBuf)[876 + i]);
} }
// Process group with 5 CNs // Process group with 5 CNs
M = (1*Z + 31)>>5; M = (1*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [28416];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [28416];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); ((__m256i*)bnProcBufRes)[888 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[312 + i ], ((__m256i*) bnProcBuf)[888 + i]);
} }
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); ((__m256i*)bnProcBufRes)[900 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[312 + i ], ((__m256i*) bnProcBuf)[900 + i]);
} }
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); ((__m256i*)bnProcBufRes)[912 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[312 + i ], ((__m256i*) bnProcBuf)[912 + i]);
} }
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); ((__m256i*)bnProcBufRes)[924 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[312 + i ], ((__m256i*) bnProcBuf)[924 + i]);
} }
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); ((__m256i*)bnProcBufRes)[936 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[312 + i ], ((__m256i*) bnProcBuf)[936 + i]);
} }
// Process group with 6 CNs // Process group with 6 CNs
// Process group with 7 CNs // Process group with 7 CNs
......
#include <stdint.h>
#include <immintrin.h>
void nrLDPC_bnProc_BG2_R23_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) { void nrLDPC_bnProc_BG2_R23_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
__m256i* p_bnProcBuf; __m256i* p_bnProcBuf;
__m256i* p_bnProcBufRes; __m256i* p_bnProcBufRes;
...@@ -8,123 +6,73 @@ void nrLDPC_bnProc_BG2_R23_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* ...@@ -8,123 +6,73 @@ void nrLDPC_bnProc_BG2_R23_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t*
uint32_t M, i; uint32_t M, i;
// Process group with 2 CNs // Process group with 2 CNs
M = (3*Z + 31)>>5; M = (3*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [1152];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [1152];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [1152];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); ((__m256i*)bnProcBufRes)[36 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[36 + i ], ((__m256i*) bnProcBuf)[36 + i]);
} }
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [1152];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); ((__m256i*)bnProcBufRes)[72 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[36 + i ], ((__m256i*) bnProcBuf)[72 + i]);
} }
// Process group with 3 CNs // Process group with 3 CNs
M = (5*Z + 31)>>5; M = (5*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [3456];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [3456];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [2304];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); ((__m256i*)bnProcBufRes)[108 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[72 + i ], ((__m256i*) bnProcBuf)[108 + i]);
} }
p_res = &p_bnProcBufRes[60];
p_llrRes = (__m256i*) &llrRes [2304];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); ((__m256i*)bnProcBufRes)[168 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[72 + i ], ((__m256i*) bnProcBuf)[168 + i]);
} }
p_res = &p_bnProcBufRes[120];
p_llrRes = (__m256i*) &llrRes [2304];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]); ((__m256i*)bnProcBufRes)[228 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[72 + i ], ((__m256i*) bnProcBuf)[228 + i]);
} }
// Process group with 4 CNs // Process group with 4 CNs
M = (3*Z + 31)>>5; M = (3*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [9216];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [9216];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [4224];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); ((__m256i*)bnProcBufRes)[288 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[132 + i ], ((__m256i*) bnProcBuf)[288 + i]);
} }
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [4224];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); ((__m256i*)bnProcBufRes)[324 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[132 + i ], ((__m256i*) bnProcBuf)[324 + i]);
} }
p_res = &p_bnProcBufRes[72];
p_llrRes = (__m256i*) &llrRes [4224];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); ((__m256i*)bnProcBufRes)[360 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[132 + i ], ((__m256i*) bnProcBuf)[360 + i]);
} }
p_res = &p_bnProcBufRes[108];
p_llrRes = (__m256i*) &llrRes [4224];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]); ((__m256i*)bnProcBufRes)[396 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[132 + i ], ((__m256i*) bnProcBuf)[396 + i]);
} }
// Process group with 5 CNs // Process group with 5 CNs
M = (2*Z + 31)>>5; M = (2*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [13824];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [13824];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); ((__m256i*)bnProcBufRes)[432 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[168 + i ], ((__m256i*) bnProcBuf)[432 + i]);
} }
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); ((__m256i*)bnProcBufRes)[456 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[168 + i ], ((__m256i*) bnProcBuf)[456 + i]);
} }
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); ((__m256i*)bnProcBufRes)[480 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[168 + i ], ((__m256i*) bnProcBuf)[480 + i]);
} }
p_res = &p_bnProcBufRes[72];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]); ((__m256i*)bnProcBufRes)[504 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[168 + i ], ((__m256i*) bnProcBuf)[504 + i]);
} }
p_res = &p_bnProcBufRes[96];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]); ((__m256i*)bnProcBufRes)[528 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[168 + i ], ((__m256i*) bnProcBuf)[528 + i]);
} }
// Process group with 6 CNs // Process group with 6 CNs
M = (1*Z + 31)>>5; M = (1*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [17664];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [17664];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]); ((__m256i*)bnProcBufRes)[552 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[192 + i ], ((__m256i*) bnProcBuf)[552 + i]);
} }
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]); ((__m256i*)bnProcBufRes)[564 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[192 + i ], ((__m256i*) bnProcBuf)[564 + i]);
} }
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]); ((__m256i*)bnProcBufRes)[576 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[192 + i ], ((__m256i*) bnProcBuf)[576 + i]);
} }
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]); ((__m256i*)bnProcBufRes)[588 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[192 + i ], ((__m256i*) bnProcBuf)[588 + i]);
} }
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]); ((__m256i*)bnProcBufRes)[600 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[192 + i ], ((__m256i*) bnProcBuf)[600 + i]);
} }
p_res = &p_bnProcBufRes[60];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]); ((__m256i*)bnProcBufRes)[612 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[192 + i ], ((__m256i*) bnProcBuf)[612 + i]);
} }
// Process group with 7 CNs // Process group with 7 CNs
// Process group with 8 CNs // Process group with 8 CNs
......
static inline void nrLDPC_bnProcPc_BG1_R13_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) { static inline void nrLDPC_bnProcPc_BG1_R13_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
__m256i ymm0, ymm1, ymmRes0, ymmRes1; __m256i ymm0, ymm1, ymmRes0, ymmRes1;
__m128i* p_bnProcBuf; __m128i* p_bnProcBuf;
__m128i* p_llrProcBuf; __m128i* p_llrProcBuf;
......
static inline void nrLDPC_bnProcPc_BG1_R23_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) { static inline void nrLDPC_bnProcPc_BG1_R23_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
__m256i ymm0, ymm1, ymmRes0, ymmRes1; __m256i ymm0, ymm1, ymmRes0, ymmRes1;
__m128i* p_bnProcBuf; __m128i* p_bnProcBuf;
__m128i* p_llrProcBuf; __m128i* p_llrProcBuf;
...@@ -11,8 +11,8 @@ static inline void nrLDPC_bnProcPc_BG1_R23_AVX2(int8_t* bnProcBuf,int8_t* llrRes ...@@ -11,8 +11,8 @@ static inline void nrLDPC_bnProcPc_BG1_R23_AVX2(int8_t* bnProcBuf,int8_t* llrRes
p_llrProcBuf = (__m128i*) &llrProcBuf [3456]; p_llrProcBuf = (__m128i*) &llrProcBuf [3456];
p_llrRes = (__m256i*) &llrRes [3456]; p_llrRes = (__m256i*) &llrRes [3456];
for (int i=0,j=0;i<M;i++,j+=2) { for (int i=0,j=0;i<M;i++,j+=2) {
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]); ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]); ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j]); ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0); ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j +1]);
......
static inline void nrLDPC_bnProcPc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) { static inline void nrLDPC_bnProcPc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
__m256i ymm0, ymm1, ymmRes0, ymmRes1; __m256i ymm0, ymm1, ymmRes0, ymmRes1;
__m128i* p_bnProcBuf; __m128i* p_bnProcBuf;
__m128i* p_llrProcBuf; __m128i* p_llrProcBuf;
...@@ -11,8 +11,8 @@ static inline void nrLDPC_bnProcPc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* llrRes ...@@ -11,8 +11,8 @@ static inline void nrLDPC_bnProcPc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* llrRes
p_llrProcBuf = (__m128i*) &llrProcBuf [384]; p_llrProcBuf = (__m128i*) &llrProcBuf [384];
p_llrRes = (__m256i*) &llrRes [384]; p_llrRes = (__m256i*) &llrRes [384];
for (int i=0,j=0;i<M;i++,j+=2) { for (int i=0,j=0;i<M;i++,j+=2) {
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]); ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]); ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j]); ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0); ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j +1]); ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j +1]);
......
static inline void nrLDPC_bnProcPc_BG2_R13_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) { static inline void nrLDPC_bnProcPc_BG2_R13_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
__m256i ymm0, ymm1, ymmRes0, ymmRes1; __m256i ymm0, ymm1, ymmRes0, ymmRes1;
__m128i* p_bnProcBuf; __m128i* p_bnProcBuf;
__m128i* p_llrProcBuf; __m128i* p_llrProcBuf;
__m256i* p_llrRes; __m256i* p_llrRes;
uint32_t M ; uint32_t M ;
// Process group with 1 CNs // Process group with 1 CNs
M = (18*Z + 31)>>5;
p_bnProcBuf = (__m128i*) &bnProcBuf [0];
p_llrProcBuf = (__m128i*) &llrProcBuf [0];
p_llrRes = (__m256i*) &llrRes [0];
for (int i=0,j=0;i<M;i++,j+=2) {
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
p_llrRes[i] = _mm256_permute4x64_epi64(ymm0, 0xD8);
}
// Process group with 2 CNs // Process group with 2 CNs
M = (1*Z + 31)>>5; M = (1*Z + 31)>>5;
p_bnProcBuf = (__m128i*) &bnProcBuf [6912]; p_bnProcBuf = (__m128i*) &bnProcBuf [6912];
p_llrProcBuf = (__m128i*) &llrProcBuf [6912]; p_llrProcBuf = (__m128i*) &llrProcBuf [6912];
p_llrRes = (__m256i*) &llrRes [6912]; p_llrRes = (__m256i*) &llrRes [6912];
for (int i=0,j=0;i<M;i++,j+=2) { for (int i=0,j=0;i<M;i++,j+=2) {
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]); ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]); ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j]); ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0); ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j +1]); ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j +1]);
......
static inline void nrLDPC_bnProcPc_BG2_R15_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) { static inline void nrLDPC_bnProcPc_BG2_R15_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
__m256i ymm0, ymm1, ymmRes0, ymmRes1; __m256i ymm0, ymm1, ymmRes0, ymmRes1;
__m128i* p_bnProcBuf; __m128i* p_bnProcBuf;
__m128i* p_llrProcBuf; __m128i* p_llrProcBuf;
__m256i* p_llrRes; __m256i* p_llrRes;
uint32_t M ; uint32_t M ;
// Process group with 1 CNs // Process group with 1 CNs
M = (38*Z + 31)>>5;
p_bnProcBuf = (__m128i*) &bnProcBuf [0];
p_llrProcBuf = (__m128i*) &llrProcBuf [0];
p_llrRes = (__m256i*) &llrRes [0];
for (int i=0,j=0;i<M;i++,j+=2) {
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
p_llrRes[i] = _mm256_permute4x64_epi64(ymm0, 0xD8);
}
// Process group with 2 CNs // Process group with 2 CNs
// Process group with 3 CNs // Process group with 3 CNs
// Process group with 4 CNs // Process group with 4 CNs
......
...@@ -25,8 +25,8 @@ static inline void nrLDPC_bnProcPc_BG1_R23_AVX512(int8_t* bnProcBuf,int8_t* llrR ...@@ -25,8 +25,8 @@ static inline void nrLDPC_bnProcPc_BG1_R23_AVX512(int8_t* bnProcBuf,int8_t* llrR
p_llrProcBuf = (__m256i*) &llrProcBuf [3456]; p_llrProcBuf = (__m256i*) &llrProcBuf [3456];
p_llrRes = (__m512i*) &llrRes [3456]; p_llrRes = (__m512i*) &llrRes [3456];
for (int i=0,j=0;i<M;i++,j+=2) { for (int i=0,j=0;i<M;i++,j+=2) {
zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);
zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]); zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);
zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]);
zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);
zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]);
......
...@@ -25,8 +25,8 @@ static inline void nrLDPC_bnProcPc_BG1_R89_AVX512(int8_t* bnProcBuf,int8_t* llrR ...@@ -25,8 +25,8 @@ static inline void nrLDPC_bnProcPc_BG1_R89_AVX512(int8_t* bnProcBuf,int8_t* llrR
p_llrProcBuf = (__m256i*) &llrProcBuf [384]; p_llrProcBuf = (__m256i*) &llrProcBuf [384];
p_llrRes = (__m512i*) &llrRes [384]; p_llrRes = (__m512i*) &llrRes [384];
for (int i=0,j=0;i<M;i++,j+=2) { for (int i=0,j=0;i<M;i++,j+=2) {
zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);
zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]); zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);
zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]); zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j]);
zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0); zmmRes0 = _mm512_adds_epi16(zmmRes0, zmm0);
zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]); zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[36 + j +1]);
......
...@@ -25,8 +25,8 @@ static inline void nrLDPC_bnProcPc_BG2_R13_AVX512(int8_t* bnProcBuf,int8_t* llrR ...@@ -25,8 +25,8 @@ static inline void nrLDPC_bnProcPc_BG2_R13_AVX512(int8_t* bnProcBuf,int8_t* llrR
p_llrProcBuf = (__m256i*) &llrProcBuf [6912]; p_llrProcBuf = (__m256i*) &llrProcBuf [6912];
p_llrRes = (__m512i*) &llrRes [6912]; p_llrRes = (__m512i*) &llrRes [6912];
for (int i=0,j=0;i<M;i++,j+=2) { for (int i=0,j=0;i<M;i++,j+=2) {
zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]); zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);
zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]); zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);
zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]); zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j]);
zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0); zmmRes0 = _mm512_adds_epi16(zmmRes0,zmm0);
zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]); zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[12 + j +1]);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment