Commit 845a2264 authored by Sy's avatar Sy

optimized 5G NR LDPC decoder

parent c8a787d2
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
......@@ -328,17 +329,17 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP
{
case 13:
{
nrLDPC_bnProcPc_BG1_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG1_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
case 23:
{
nrLDPC_bnProcPc_BG1_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG1_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
case 89:
{
nrLDPC_bnProcPc_BG1_R89_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG1_R89_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
}
......@@ -349,20 +350,20 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDP
{
case 15:
{
nrLDPC_bnProcPc_BG2_R15_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG2_R15_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
case 13:
{
nrLDPC_bnProcPc_BG2_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG2_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
case 23:
{
nrLDPC_bnProcPc_BG2_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG2_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
......@@ -614,17 +615,17 @@ if (BG==1)
{
case 13:
{
nrLDPC_bnProcPc_BG1_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG1_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
case 23:
{
nrLDPC_bnProcPc_BG1_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG1_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
case 89:
{
nrLDPC_bnProcPc_BG1_R89_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG1_R89_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
}
......@@ -635,20 +636,20 @@ if (BG==1)
{
case 15:
{
nrLDPC_bnProcPc_BG2_R15_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG2_R15_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
case 13:
{
nrLDPC_bnProcPc_BG2_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG2_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
case 23:
{
nrLDPC_bnProcPc_BG2_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG2_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
......@@ -723,7 +724,7 @@ if (BG==1)
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R13_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG2_R13_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
nrLDPC_bnProc_BG2_R13_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,+p_procBuf->llrRes, Z);
#endif
break;
}
......@@ -906,7 +907,7 @@ if (BG==1)
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->bnProcPc);
#endif
//nrLDPC_bnProcPc(p_lut, p_procBuf, Z);
// nrLDPC_bnProcPc(p_lut, p_procBuf, Z);
if (BG==1)
{
......@@ -914,17 +915,17 @@ if (BG==1)
{
case 13:
{
nrLDPC_bnProcPc_BG1_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG1_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
case 23:
{
nrLDPC_bnProcPc_BG1_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG1_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
case 89:
{
nrLDPC_bnProcPc_BG1_R89_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG1_R89_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
}
......@@ -935,20 +936,20 @@ if (BG==1)
{
case 15:
{
nrLDPC_bnProcPc_BG2_R15_AVX2(p_procBuf->bnProcBuf, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG2_R15_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes, p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
case 13:
{
nrLDPC_bnProcPc_BG2_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG2_R13_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
case 23:
{
nrLDPC_bnProcPc_BG2_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
nrLDPC_bnProcPc_BG2_R23_AVX2(p_procBuf->bnProcBuf,p_procBuf->bnProcBufRes,p_procBuf->llrRes, p_procBuf->llrProcBuf, Z);
break;
}
......@@ -1136,3 +1137,4 @@ if (BG==1)
#include <stdint.h>
#include <immintrin.h>
#include "../../nrLDPCdecoder_defs.h"
......@@ -25,7 +21,7 @@ void nrLDPC_bnProcPc_BG2_generator_AVX2(int R)
// fprintf(fd,"#include <stdint.h>\n");
//fprintf(fd,"#include <immintrin.h>\n");
fprintf(fd,"static inline void nrLDPC_bnProcPc_BG2_R%s_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {\n",ratestr[R]);
fprintf(fd,"static inline void nrLDPC_bnProcPc_BG2_R%s_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {\n",ratestr[R]);
const uint8_t* lut_numBnInBnGroups;
const uint32_t* lut_startAddrBnGroups;
const uint16_t* lut_startAddrBnGroupsLlr;
......@@ -51,10 +47,9 @@ void nrLDPC_bnProcPc_BG2_generator_AVX2(int R)
}
else { printf("aborting, illegal R %d\n",R); fclose(fd);abort();}
// Number of BNs in Groups
// uint32_t M;
//uint32_t M32rem;
//uint32_t i,j;
uint32_t k;
// Offset to each bit within a group in terms of 32 Byte
uint32_t cnOffsetInGroup;
......@@ -66,16 +61,18 @@ void nrLDPC_bnProcPc_BG2_generator_AVX2(int R)
fprintf(fd," __m128i* p_bnProcBuf; \n");
fprintf(fd," __m128i* p_llrProcBuf;\n");
fprintf(fd," __m256i* p_llrRes; \n");
// fprintf(fd," __m256i* p_bnProcBufRes; \n");
// fprintf(fd," __m256i* p_llrProcBuf256; \n");
fprintf(fd," uint32_t M ;\n");
fprintf(fd, "// Process group with 1 CNs \n");
fprintf(fd, "// Process group with 1 CNs \n");
/*
// Process group with 2 CNs
// Process group with 1 CNs
if (lut_numBnInBnGroups[0] > 0)
{
// if (lut_numBnInBnGroups[0] > 0)
// {
// If elements in group move to next address
// idxBnGroup++;
......@@ -83,36 +80,30 @@ fprintf(fd, "// Process group with 1 CNs \n");
fprintf(fd," M = (%d*Z + 31)>>5;\n",lut_numBnInBnGroups[0] );
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[0]*NR_LDPC_ZMAX)>>4;
// cnOffsetInGroup = (lut_numBnInBnGroups[0]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 2
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_bnProcBufRes = (__m256i*) &bnProcBufRes [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrProcBuf256 = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
fprintf(fd," p_bnProcBufRes[i] = p_llrProcBuf256[i];\n");
// First 16 LLRs of first CN
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = _mm256_adds_epi16(ymm0, ymm1);\n");
// Loop over CNs
/*for (k=1; k<1; k++)
{
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1); \n");
}
*/
// Add LLR from receiver input
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);\n");
fprintf(fd," ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);\n");
// Second 16 LLRs of first CN
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1 ]);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j + 1 ]);\n");
fprintf(fd," ymmRes1 = _mm256_adds_epi16(ymm0, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);\n");
......@@ -122,8 +113,8 @@ fprintf(fd, "// Process group with 1 CNs \n");
fprintf(fd,"}\n");
}
// =====================================================================
//}
*/ // =====================================================================
// Process group with 2 CNs
......@@ -150,8 +141,8 @@ fprintf(fd, "// Process group with 2 CNs \n");
// Loop over BNs
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n");
fprintf(fd," ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n");
// Loop over CNs
for (k=1; k<2; k++)
......@@ -1349,7 +1340,7 @@ fprintf(fd, "// Process group with <23 CNs \n");
fprintf(fd," ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1); \n");
}
}
// Add LLR from receiver input
fprintf(fd," ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
......@@ -1771,4 +1762,3 @@ fprintf(fd, "// Process group with 30 CNs \n");
#include <stdint.h>
#include <immintrin.h>
#include "../../nrLDPCdecoder_defs.h"
......@@ -148,8 +144,8 @@ fprintf(fd, "// Process group with 2 CNs \n");
// Loop over BNs
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n");
fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n");
// Loop over CNs
for (k=1; k<2; k++)
......
......@@ -146,8 +146,8 @@ fprintf(fd, "// Process group with 2 CNs \n");
// Loop over BNs
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," zmm0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," zmm1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n");
fprintf(fd," zmmRes0 = _mm512_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," zmmRes1 = _mm512_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n");
// Loop over CNs
for (k=1; k<2; k++)
......
static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
__m256i* p_bnProcBuf;
__m256i* p_bnProcBufRes;
__m256i* p_llrRes;
__m256i* p_res;
uint32_t M, i;
// Process group with 2 CNs
M = (3*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [384];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [384];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [384];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
((__m256i*)bnProcBufRes)[12 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[12 + i ], ((__m256i*) bnProcBuf)[12 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [384];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
((__m256i*)bnProcBufRes)[48 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[12 + i ], ((__m256i*) bnProcBuf)[48 + i]);
}
// Process group with 3 CNs
M = (21*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [2688];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [2688];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [1536];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
((__m256i*)bnProcBufRes)[84 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[48 + i ], ((__m256i*) bnProcBuf)[84 + i]);
}
p_res = &p_bnProcBufRes[252];
p_llrRes = (__m256i*) &llrRes [1536];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[252 + i]);
((__m256i*)bnProcBufRes)[336 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[48 + i ], ((__m256i*) bnProcBuf)[336 + i]);
}
p_res = &p_bnProcBufRes[504];
p_llrRes = (__m256i*) &llrRes [1536];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[504 + i]);
((__m256i*)bnProcBufRes)[588 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[48 + i ], ((__m256i*) bnProcBuf)[588 + i]);
}
// Process group with 4 CNs
M = (1*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [26880];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [26880];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
((__m256i*)bnProcBufRes)[840 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[300 + i ], ((__m256i*) bnProcBuf)[840 + i]);
}
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
((__m256i*)bnProcBufRes)[852 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[300 + i ], ((__m256i*) bnProcBuf)[852 + i]);
}
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
((__m256i*)bnProcBufRes)[864 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[300 + i ], ((__m256i*) bnProcBuf)[864 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
((__m256i*)bnProcBufRes)[876 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[300 + i ], ((__m256i*) bnProcBuf)[876 + i]);
}
// Process group with 5 CNs
M = (1*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [28416];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [28416];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
((__m256i*)bnProcBufRes)[888 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[312 + i ], ((__m256i*) bnProcBuf)[888 + i]);
}
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
((__m256i*)bnProcBufRes)[900 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[312 + i ], ((__m256i*) bnProcBuf)[900 + i]);
}
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
((__m256i*)bnProcBufRes)[912 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[312 + i ], ((__m256i*) bnProcBuf)[912 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
((__m256i*)bnProcBufRes)[924 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[312 + i ], ((__m256i*) bnProcBuf)[924 + i]);
}
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]);
((__m256i*)bnProcBufRes)[936 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[312 + i ], ((__m256i*) bnProcBuf)[936 + i]);
}
// Process group with 6 CNs
// Process group with 7 CNs
......
#include <stdint.h>
#include <immintrin.h>
void nrLDPC_bnProc_BG2_R23_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
__m256i* p_bnProcBuf;
__m256i* p_bnProcBufRes;
......@@ -8,123 +6,73 @@ void nrLDPC_bnProc_BG2_R23_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t*
uint32_t M, i;
// Process group with 2 CNs
M = (3*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [1152];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [1152];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [1152];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
((__m256i*)bnProcBufRes)[36 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[36 + i ], ((__m256i*) bnProcBuf)[36 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [1152];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
((__m256i*)bnProcBufRes)[72 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[36 + i ], ((__m256i*) bnProcBuf)[72 + i]);
}
// Process group with 3 CNs
M = (5*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [3456];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [3456];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [2304];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
((__m256i*)bnProcBufRes)[108 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[72 + i ], ((__m256i*) bnProcBuf)[108 + i]);
}
p_res = &p_bnProcBufRes[60];
p_llrRes = (__m256i*) &llrRes [2304];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]);
((__m256i*)bnProcBufRes)[168 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[72 + i ], ((__m256i*) bnProcBuf)[168 + i]);
}
p_res = &p_bnProcBufRes[120];
p_llrRes = (__m256i*) &llrRes [2304];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]);
((__m256i*)bnProcBufRes)[228 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[72 + i ], ((__m256i*) bnProcBuf)[228 + i]);
}
// Process group with 4 CNs
M = (3*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [9216];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [9216];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
((__m256i*)bnProcBufRes)[288 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[132 + i ], ((__m256i*) bnProcBuf)[288 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
((__m256i*)bnProcBufRes)[324 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[132 + i ], ((__m256i*) bnProcBuf)[324 + i]);
}
p_res = &p_bnProcBufRes[72];
p_llrRes = (__m256i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]);
((__m256i*)bnProcBufRes)[360 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[132 + i ], ((__m256i*) bnProcBuf)[360 + i]);
}
p_res = &p_bnProcBufRes[108];
p_llrRes = (__m256i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]);
((__m256i*)bnProcBufRes)[396 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[132 + i ], ((__m256i*) bnProcBuf)[396 + i]);
}
// Process group with 5 CNs
M = (2*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [13824];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [13824];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
((__m256i*)bnProcBufRes)[432 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[168 + i ], ((__m256i*) bnProcBuf)[432 + i]);
}
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
((__m256i*)bnProcBufRes)[456 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[168 + i ], ((__m256i*) bnProcBuf)[456 + i]);
}
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]);
((__m256i*)bnProcBufRes)[480 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[168 + i ], ((__m256i*) bnProcBuf)[480 + i]);
}
p_res = &p_bnProcBufRes[72];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]);
((__m256i*)bnProcBufRes)[504 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[168 + i ], ((__m256i*) bnProcBuf)[504 + i]);
}
p_res = &p_bnProcBufRes[96];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]);
((__m256i*)bnProcBufRes)[528 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[168 + i ], ((__m256i*) bnProcBuf)[528 + i]);
}
// Process group with 6 CNs
M = (1*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [17664];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [17664];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
((__m256i*)bnProcBufRes)[552 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[192 + i ], ((__m256i*) bnProcBuf)[552 + i]);
}
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
((__m256i*)bnProcBufRes)[564 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[192 + i ], ((__m256i*) bnProcBuf)[564 + i]);
}
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
((__m256i*)bnProcBufRes)[576 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[192 + i ], ((__m256i*) bnProcBuf)[576 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
((__m256i*)bnProcBufRes)[588 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[192 + i ], ((__m256i*) bnProcBuf)[588 + i]);
}
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]);
((__m256i*)bnProcBufRes)[600 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[192 + i ], ((__m256i*) bnProcBuf)[600 + i]);
}
p_res = &p_bnProcBufRes[60];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]);
((__m256i*)bnProcBufRes)[612 + i ] = _mm256_subs_epi8(((__m256i*)llrRes)[192 + i ], ((__m256i*) bnProcBuf)[612 + i]);
}
// Process group with 7 CNs
// Process group with 8 CNs
......
static inline void nrLDPC_bnProcPc_BG1_R13_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
static inline void nrLDPC_bnProcPc_BG1_R13_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
__m256i ymm0, ymm1, ymmRes0, ymmRes1;
__m128i* p_bnProcBuf;
__m128i* p_llrProcBuf;
......
static inline void nrLDPC_bnProcPc_BG1_R23_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
static inline void nrLDPC_bnProcPc_BG1_R23_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
__m256i ymm0, ymm1, ymmRes0, ymmRes1;
__m128i* p_bnProcBuf;
__m128i* p_llrProcBuf;
......@@ -11,8 +11,8 @@ static inline void nrLDPC_bnProcPc_BG1_R23_AVX2(int8_t* bnProcBuf,int8_t* llrRes
p_llrProcBuf = (__m128i*) &llrProcBuf [3456];
p_llrRes = (__m256i*) &llrRes [3456];
for (int i=0,j=0;i<M;i++,j+=2) {
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j +1]);
......
static inline void nrLDPC_bnProcPc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
static inline void nrLDPC_bnProcPc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
__m256i ymm0, ymm1, ymmRes0, ymmRes1;
__m128i* p_bnProcBuf;
__m128i* p_llrProcBuf;
......@@ -11,8 +11,8 @@ static inline void nrLDPC_bnProcPc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* llrRes
p_llrProcBuf = (__m128i*) &llrProcBuf [384];
p_llrRes = (__m256i*) &llrRes [384];
for (int i=0,j=0;i<M;i++,j+=2) {
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j +1]);
......
static inline void nrLDPC_bnProcPc_BG2_R13_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
static inline void nrLDPC_bnProcPc_BG2_R13_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
__m256i ymm0, ymm1, ymmRes0, ymmRes1;
__m128i* p_bnProcBuf;
__m128i* p_llrProcBuf;
__m256i* p_llrRes;
uint32_t M ;
// Process group with 1 CNs
M = (18*Z + 31)>>5;
p_bnProcBuf = (__m128i*) &bnProcBuf [0];
p_llrProcBuf = (__m128i*) &llrProcBuf [0];
p_llrRes = (__m256i*) &llrRes [0];
for (int i=0,j=0;i<M;i++,j+=2) {
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
p_llrRes[i] = _mm256_permute4x64_epi64(ymm0, 0xD8);
}
// Process group with 2 CNs
M = (1*Z + 31)>>5;
p_bnProcBuf = (__m128i*) &bnProcBuf [6912];
p_llrProcBuf = (__m128i*) &llrProcBuf [6912];
p_llrRes = (__m256i*) &llrRes [6912];
for (int i=0,j=0;i<M;i++,j+=2) {
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j +1]);
......
static inline void nrLDPC_bnProcPc_BG2_R15_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
static inline void nrLDPC_bnProcPc_BG2_R15_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
__m256i ymm0, ymm1, ymmRes0, ymmRes1;
__m128i* p_bnProcBuf;
__m128i* p_llrProcBuf;
__m256i* p_llrRes;
uint32_t M ;
// Process group with 1 CNs
M = (38*Z + 31)>>5;
p_bnProcBuf = (__m128i*) &bnProcBuf [0];
p_llrProcBuf = (__m128i*) &llrProcBuf [0];
p_llrRes = (__m256i*) &llrRes [0];
for (int i=0,j=0;i<M;i++,j+=2) {
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
p_llrRes[i] = _mm256_permute4x64_epi64(ymm0, 0xD8);
}
// Process group with 2 CNs
// Process group with 3 CNs
// Process group with 4 CNs
......
static inline void nrLDPC_bnProcPc_BG2_R23_AVX2(int8_t* bnProcBuf,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
static inline void nrLDPC_bnProcPc_BG2_R23_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
__m256i ymm0, ymm1, ymmRes0, ymmRes1;
__m128i* p_bnProcBuf;
__m128i* p_llrProcBuf;
__m256i* p_llrRes;
uint32_t M ;
// Process group with 1 CNs
M = (3*Z + 31)>>5;
p_bnProcBuf = (__m128i*) &bnProcBuf [0];
p_llrProcBuf = (__m128i*) &llrProcBuf [0];
p_llrRes = (__m256i*) &llrRes [0];
for (int i=0,j=0;i<M;i++,j+=2) {
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[j+1]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
p_llrRes[i] = _mm256_permute4x64_epi64(ymm0, 0xD8);
}
// Process group with 2 CNs
M = (3*Z + 31)>>5;
p_bnProcBuf = (__m128i*) &bnProcBuf [1152];
p_llrProcBuf = (__m128i*) &llrProcBuf [1152];
p_llrRes = (__m256i*) &llrRes [1152];
for (int i=0,j=0;i<M;i++,j+=2) {
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);
ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j +1]);
......
static inline void nrLDPC_bnProc_BG1_R89_AVX512(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
__m512i* p_bnProcBuf;
__m512i* p_bnProcBufRes;
__m512i* p_llrRes;
__m512i* p_res;
uint32_t M, i;
// Process group with 2 CNs
M = (3*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [384];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [384];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m512i*) &llrRes [384];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
((__m512i*)bnProcBufRes)[6 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[6 + i ], ((__m512i*) bnProcBuf)[6 + i]);
}
p_res = &p_bnProcBufRes[18];
p_llrRes = (__m512i*) &llrRes [384];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[18 + i]);
((__m512i*)bnProcBufRes)[24 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[6 + i ], ((__m512i*) bnProcBuf)[24 + i]);
}
// Process group with 3 CNs
M = (21*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [2688];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [2688];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m512i*) &llrRes [1536];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
((__m512i*)bnProcBufRes)[42 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[24 + i ], ((__m512i*) bnProcBuf)[42 + i]);
}
p_res = &p_bnProcBufRes[126];
p_llrRes = (__m512i*) &llrRes [1536];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[126 + i]);
((__m512i*)bnProcBufRes)[168 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[24 + i ], ((__m512i*) bnProcBuf)[168 + i]);
}
p_res = &p_bnProcBufRes[252];
p_llrRes = (__m512i*) &llrRes [1536];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[252 + i]);
((__m512i*)bnProcBufRes)[294 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[24 + i ], ((__m512i*) bnProcBuf)[294 + i]);
}
// Process group with 4 CNs
M = (1*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [26880];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [26880];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m512i*) &llrRes [9600];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
((__m512i*)bnProcBufRes)[420 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[150 + i ], ((__m512i*) bnProcBuf)[420 + i]);
}
p_res = &p_bnProcBufRes[6];
p_llrRes = (__m512i*) &llrRes [9600];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[6 + i]);
((__m512i*)bnProcBufRes)[426 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[150 + i ], ((__m512i*) bnProcBuf)[426 + i]);
}
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m512i*) &llrRes [9600];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
((__m512i*)bnProcBufRes)[432 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[150 + i ], ((__m512i*) bnProcBuf)[432 + i]);
}
p_res = &p_bnProcBufRes[18];
p_llrRes = (__m512i*) &llrRes [9600];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[18 + i]);
((__m512i*)bnProcBufRes)[438 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[150 + i ], ((__m512i*) bnProcBuf)[438 + i]);
}
// Process group with 5 CNs
M = (1*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [28416];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [28416];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m512i*) &llrRes [9984];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
((__m512i*)bnProcBufRes)[444 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[156 + i ], ((__m512i*) bnProcBuf)[444 + i]);
}
p_res = &p_bnProcBufRes[6];
p_llrRes = (__m512i*) &llrRes [9984];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[6 + i]);
((__m512i*)bnProcBufRes)[450 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[156 + i ], ((__m512i*) bnProcBuf)[450 + i]);
}
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m512i*) &llrRes [9984];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
((__m512i*)bnProcBufRes)[456 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[156 + i ], ((__m512i*) bnProcBuf)[456 + i]);
}
p_res = &p_bnProcBufRes[18];
p_llrRes = (__m512i*) &llrRes [9984];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[18 + i]);
((__m512i*)bnProcBufRes)[462 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[156 + i ], ((__m512i*) bnProcBuf)[462 + i]);
}
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m512i*) &llrRes [9984];
for (i=0;i<M;i++) {
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
((__m512i*)bnProcBufRes)[468 + i ] = _mm512_subs_epi8(((__m512i*)llrRes)[156 + i ], ((__m512i*) bnProcBuf)[468 + i]);
}
// Process group with 6 CNs
// Process group with 7 CNs
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment