Commit 0f704894 authored by Sy

degradation of performance in BLER corrected | bnProc & bnProcPc unrolled

parent 5e5bc1a9
/* /*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
...@@ -22,7 +21,7 @@ ...@@ -22,7 +21,7 @@
/*!\file nrLDPC_mPass.h /*!\file nrLDPC_mPass.h
* \brief Defines the functions for message passing * \brief Defines the functions for message passing
* *
*/ */
#ifndef __NR_LDPC_MPASS__H__ #ifndef __NR_LDPC_MPASS__H__
#define __NR_LDPC_MPASS__H__ #define __NR_LDPC_MPASS__H__
...@@ -42,7 +41,7 @@ ...@@ -42,7 +41,7 @@
\param Z Lifting size \param Z Lifting size
\param cshift Circular shift \param cshift Circular shift
*/ */
//faster memcpy using "rep movsb", which is highly optimized on modern processors //faster memcpy using "rep movsb", which is highly optimized on modern processors
void *memcpy1(void *dst, const void *src, size_t n) void *memcpy1(void *dst, const void *src, size_t n)
{ {
...@@ -177,8 +176,7 @@ static inline void nrLDPC_llr2CnProcBuf_BG1(t_nrLDPC_lut* p_lut, int8_t* llr, t_ ...@@ -177,8 +176,7 @@ static inline void nrLDPC_llr2CnProcBuf_BG1(t_nrLDPC_lut* p_lut, int8_t* llr, t_
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX; bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX;
// #pragma omp simd
// #pragma omp parallel for schedule(dynamic)
for (j=0; j<3; j++) for (j=0; j<3; j++)
{ {
...@@ -1026,7 +1024,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1026,7 +1024,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 3 BNs // CN group with 3 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX; bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX;
// #pragma omp simd
for (j=0;j<2; j++) for (j=0;j<2; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[0] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[0] + j*bitOffsetInGroup];
...@@ -1037,11 +1035,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1037,11 +1035,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 4 BNs // CN group with 4 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX; bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX;
// #pragma omp simd
for (j=0; j<3; j++) for (j=0; j<3; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[1] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[1] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[1]; i++) for (i=0; i<lut_numCnInCnGroups[1]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG4[j][i] + lut_bnPosBnProcBuf_CNG4[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG4[j][i] + lut_bnPosBnProcBuf_CNG4[j][i]*Z;
...@@ -1053,11 +1051,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1053,11 +1051,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 5 BNs // CN group with 5 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX; bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX;
// #pragma omp simd
for (j=0; j<4; j++) for (j=0; j<4; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[2] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[2] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[2]; i++) for (i=0; i<lut_numCnInCnGroups[2]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG5[j][i] + lut_bnPosBnProcBuf_CNG5[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG5[j][i] + lut_bnPosBnProcBuf_CNG5[j][i]*Z;
...@@ -1070,11 +1068,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1070,11 +1068,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 6 BNs // CN group with 6 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX; bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX;
//#pragma omp simd
for (j=0; j<5; j++) for (j=0; j<5; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[3] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[3] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[3]; i++) for (i=0; i<lut_numCnInCnGroups[3]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG6[j][i] + lut_bnPosBnProcBuf_CNG6[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG6[j][i] + lut_bnPosBnProcBuf_CNG6[j][i]*Z;
...@@ -1087,12 +1085,12 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1087,12 +1085,12 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 7 BNs // CN group with 7 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX; bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX;
//#pragma omp simd
for (j=0; j<6; j++) for (j=0; j<6; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[4] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[4] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[4]; i++) for (i=0; i<lut_numCnInCnGroups[4]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG7[j][i] + lut_bnPosBnProcBuf_CNG7[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG7[j][i] + lut_bnPosBnProcBuf_CNG7[j][i]*Z;
...@@ -1105,7 +1103,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1105,7 +1103,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 8 BNs // CN group with 8 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX; bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX;
// #pragma omp simd
for (j=0; j<7; j++) for (j=0; j<7; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[5] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[5] + j*bitOffsetInGroup];
...@@ -1121,11 +1119,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1121,11 +1119,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 9 BNs // CN group with 9 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX; bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX;
// #pragma omp simd
for (j=0; j<8; j++) for (j=0; j<8; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[6] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[6] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[6]; i++) for (i=0; i<lut_numCnInCnGroups[6]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG9[j][i] + lut_bnPosBnProcBuf_CNG9[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG9[j][i] + lut_bnPosBnProcBuf_CNG9[j][i]*Z;
...@@ -1138,11 +1136,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1138,11 +1136,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 10 BNs // CN group with 10 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX; bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX;
//#pragma omp simd
for (j=0; j<9; j++) for (j=0; j<9; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[7] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[7] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[7]; i++) for (i=0; i<lut_numCnInCnGroups[7]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG10[j][i] + lut_bnPosBnProcBuf_CNG10[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG10[j][i] + lut_bnPosBnProcBuf_CNG10[j][i]*Z;
...@@ -1155,11 +1153,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf ...@@ -1155,11 +1153,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 19 BNs // CN group with 19 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX; bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX;
//#pragma omp simd
for (j=0; j<19; j++) for (j=0; j<19; j++)
{ {
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[8] + j*bitOffsetInGroup]; p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[8] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[8]; i++) for (i=0; i<lut_numCnInCnGroups[8]; i++)
{ {
idxBn = lut_startAddrBnProcBuf_CNG19[j][i] + lut_bnPosBnProcBuf_CNG19[j][i]*Z; idxBn = lut_startAddrBnProcBuf_CNG19[j][i] + lut_bnPosBnProcBuf_CNG19[j][i]*Z;
...@@ -1211,3 +1209,5 @@ static inline void nrLDPC_llrRes2llrOut(t_nrLDPC_lut* p_lut, int8_t* llrOut, t_n ...@@ -1211,3 +1209,5 @@ static inline void nrLDPC_llrRes2llrOut(t_nrLDPC_lut* p_lut, int8_t* llrOut, t_n
#endif #endif
...@@ -386,10 +386,10 @@ void nrLDPC_cnProc_BG1_generator_AVX2(int R) ...@@ -386,10 +386,10 @@ void nrLDPC_cnProc_BG1_generator_AVX2(int R)
// Process group with 8 BNs // Process group with 8 BNs
fprintf(fd,"//Process group with 8 BNs\n"); fprintf(fd,"//Process group with 8 BNs\n");
// Offset is 2*384/32 = 24 // Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,112,144,168}, {0,48,72,96,112,144,168}, const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,120,144,168}, {0,48,72,96,120,144,168},
{0,24,72,96,112,144,168}, {0,24,48,96,112,144,168}, {0,24,72,96,120,144,168}, {0,24,48,96,120,144,168},
{0,24,48,72,112,144,168}, {0,24,48,72,96,144,168}, {0,24,48,72,120,144,168}, {0,24,48,72,96,144,168},
{0,24,48,72,96,112,168}, {0,24,48,72,96,112,144}}; {0,24,48,72,96,120,168}, {0,24,48,72,96,120,144}};
......
#include <stdio.h> #include <stdio.h>
#include <stdint.h> #include <stdint.h>
#include "../../nrLDPCdecoder_defs.h" #include "../../nrLDPCdecoder_defs.h"
...@@ -327,14 +325,13 @@ void nrLDPC_cnProc_BG1_generator_AVX512(int R) ...@@ -327,14 +325,13 @@ void nrLDPC_cnProc_BG1_generator_AVX512(int R)
// ===================================================================== // =====================================================================
// Process group with 8 BNs // Process group with 8 BNs
fprintf(fd,"//Process group with 8 BNs\n"); fprintf(fd,"//Process group with 8 BNs\n");
// Offset is 2*384/32 = 12 // Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,112,144,168}, {0,48,72,96,112,144,168}, const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,120,144,168}, {0,48,72,96,120,144,168},
{0,24,72,96,112,144,168}, {0,24,48,96,112,144,168}, {0,24,72,96,120,144,168}, {0,24,48,96,120,144,168},
{0,24,48,72,112,144,168}, {0,24,48,72,96,144,168}, {0,24,48,72,120,144,168}, {0,24,48,72,96,144,168},
{0,24,48,72,96,112,168}, {0,24,48,72,96,112,144}}; {0,24,48,72,96,120,168}, {0,24,48,72,96,120,144}};
if (lut_numCnInCnGroups[5] > 0) if (lut_numCnInCnGroups[5] > 0)
{ {
// Number of groups of 64 CNs for parallel processing // Number of groups of 64 CNs for parallel processing
......
...@@ -11,16 +11,12 @@ static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBu ...@@ -11,16 +11,12 @@ static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBu
p_res = &p_bnProcBufRes[0]; p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [384]; p_llrRes = (__m256i*) &llrRes [384];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[0 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res++;
p_llrRes++;
} }
p_res = &p_bnProcBufRes[36]; p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [384]; p_llrRes = (__m256i*) &llrRes [384];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[36 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
p_res++;
p_llrRes++;
} }
// Process group with 3 CNs // Process group with 3 CNs
M = (21*Z + 31)>>5; M = (21*Z + 31)>>5;
...@@ -29,23 +25,17 @@ static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBu ...@@ -29,23 +25,17 @@ static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBu
p_res = &p_bnProcBufRes[0]; p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [1536]; p_llrRes = (__m256i*) &llrRes [1536];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[0 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res++;
p_llrRes++;
} }
p_res = &p_bnProcBufRes[252]; p_res = &p_bnProcBufRes[252];
p_llrRes = (__m256i*) &llrRes [1536]; p_llrRes = (__m256i*) &llrRes [1536];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[252 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[252 + i]);
p_res++;
p_llrRes++;
} }
p_res = &p_bnProcBufRes[504]; p_res = &p_bnProcBufRes[504];
p_llrRes = (__m256i*) &llrRes [1536]; p_llrRes = (__m256i*) &llrRes [1536];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[504 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[504 + i]);
p_res++;
p_llrRes++;
} }
// Process group with 4 CNs // Process group with 4 CNs
M = (1*Z + 31)>>5; M = (1*Z + 31)>>5;
...@@ -54,30 +44,22 @@ static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBu ...@@ -54,30 +44,22 @@ static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBu
p_res = &p_bnProcBufRes[0]; p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [9600]; p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[0 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res++;
p_llrRes++;
} }
p_res = &p_bnProcBufRes[12]; p_res = &p_bnProcBufRes[12];
p_llrRes = (__m256i*) &llrRes [9600]; p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[12 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
p_res++;
p_llrRes++;
} }
p_res = &p_bnProcBufRes[24]; p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [9600]; p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[24 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
p_res++;
p_llrRes++;
} }
p_res = &p_bnProcBufRes[36]; p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [9600]; p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[36 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
p_res++;
p_llrRes++;
} }
// Process group with 5 CNs // Process group with 5 CNs
M = (1*Z + 31)>>5; M = (1*Z + 31)>>5;
...@@ -86,37 +68,27 @@ static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBu ...@@ -86,37 +68,27 @@ static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBu
p_res = &p_bnProcBufRes[0]; p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [9984]; p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[0 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res++;
p_llrRes++;
} }
p_res = &p_bnProcBufRes[12]; p_res = &p_bnProcBufRes[12];
p_llrRes = (__m256i*) &llrRes [9984]; p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[12 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
p_res++;
p_llrRes++;
} }
p_res = &p_bnProcBufRes[24]; p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [9984]; p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[24 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
p_res++;
p_llrRes++;
} }
p_res = &p_bnProcBufRes[36]; p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [9984]; p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[36 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
p_res++;
p_llrRes++;
} }
p_res = &p_bnProcBufRes[48]; p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [9984]; p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) { for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[48 + i]); p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]);
p_res++;
p_llrRes++;
} }
// Process group with 6 CNs // Process group with 6 CNs
// Process group with 7 CNs // Process group with 7 CNs
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment