Commit 0f704894 authored by Sy

degradation of performance in BLER corrected | bnProc & bnProcPc unrolled

parent 5e5bc1a9
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
......@@ -22,7 +21,7 @@
/*!\file nrLDPC_mPass.h
* \brief Defines the functions for message passing
*
*
*/
#ifndef __NR_LDPC_MPASS__H__
#define __NR_LDPC_MPASS__H__
......@@ -42,7 +41,7 @@
\param Z Lifting size
\param cshift Circular shift
*/
//faster memcpy using "rep movsb", which is highly optimized on modern processors
//faster memcpy using "rep movsb", which is highly optimized on modern processors
void *memcpy1(void *dst, const void *src, size_t n)
{
......@@ -177,8 +176,7 @@ static inline void nrLDPC_llr2CnProcBuf_BG1(t_nrLDPC_lut* p_lut, int8_t* llr, t_
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX;
// #pragma omp simd
// #pragma omp parallel for schedule(dynamic)
for (j=0; j<3; j++)
{
......@@ -1026,7 +1024,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 3 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX;
// #pragma omp simd
for (j=0;j<2; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[0] + j*bitOffsetInGroup];
......@@ -1037,11 +1035,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 4 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX;
// #pragma omp simd
for (j=0; j<3; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[1] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[1]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG4[j][i] + lut_bnPosBnProcBuf_CNG4[j][i]*Z;
......@@ -1053,11 +1051,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 5 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX;
// #pragma omp simd
for (j=0; j<4; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[2] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[2]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG5[j][i] + lut_bnPosBnProcBuf_CNG5[j][i]*Z;
......@@ -1070,11 +1068,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 6 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX;
//#pragma omp simd
for (j=0; j<5; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[3] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[3]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG6[j][i] + lut_bnPosBnProcBuf_CNG6[j][i]*Z;
......@@ -1087,12 +1085,12 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 7 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX;
//#pragma omp simd
for (j=0; j<6; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[4] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[4]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG7[j][i] + lut_bnPosBnProcBuf_CNG7[j][i]*Z;
......@@ -1105,7 +1103,7 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 8 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX;
// #pragma omp simd
for (j=0; j<7; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[5] + j*bitOffsetInGroup];
......@@ -1121,11 +1119,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 9 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX;
// #pragma omp simd
for (j=0; j<8; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[6] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[6]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG9[j][i] + lut_bnPosBnProcBuf_CNG9[j][i]*Z;
......@@ -1138,11 +1136,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 10 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX;
//#pragma omp simd
for (j=0; j<9; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[7] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[7]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG10[j][i] + lut_bnPosBnProcBuf_CNG10[j][i]*Z;
......@@ -1155,11 +1153,11 @@ static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf
// CN group with 19 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX;
//#pragma omp simd
for (j=0; j<19; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[8] + j*bitOffsetInGroup];
// #pragma omp simd
for (i=0; i<lut_numCnInCnGroups[8]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG19[j][i] + lut_bnPosBnProcBuf_CNG19[j][i]*Z;
......@@ -1211,3 +1209,5 @@ static inline void nrLDPC_llrRes2llrOut(t_nrLDPC_lut* p_lut, int8_t* llrOut, t_n
#endif
......@@ -386,10 +386,10 @@ void nrLDPC_cnProc_BG1_generator_AVX2(int R)
// Process group with 8 BNs
fprintf(fd,"//Process group with 8 BNs\n");
// Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,112,144,168}, {0,48,72,96,112,144,168},
{0,24,72,96,112,144,168}, {0,24,48,96,112,144,168},
{0,24,48,72,112,144,168}, {0,24,48,72,96,144,168},
{0,24,48,72,96,112,168}, {0,24,48,72,96,112,144}};
const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,120,144,168}, {0,48,72,96,120,144,168},
{0,24,72,96,120,144,168}, {0,24,48,96,120,144,168},
{0,24,48,72,120,144,168}, {0,24,48,72,96,144,168},
{0,24,48,72,96,120,168}, {0,24,48,72,96,120,144}};
......
#include <stdio.h>
#include <stdint.h>
#include "../../nrLDPCdecoder_defs.h"
......@@ -327,14 +325,13 @@ void nrLDPC_cnProc_BG1_generator_AVX512(int R)
// =====================================================================
// Process group with 8 BNs
fprintf(fd,"//Process group with 8 BNs\n");
// Offset is 2*384/32 = 12
const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,112,144,168}, {0,48,72,96,112,144,168},
{0,24,72,96,112,144,168}, {0,24,48,96,112,144,168},
{0,24,48,72,112,144,168}, {0,24,48,72,96,144,168},
{0,24,48,72,96,112,168}, {0,24,48,72,96,112,144}};
// Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,120,144,168}, {0,48,72,96,120,144,168},
{0,24,72,96,120,144,168}, {0,24,48,96,120,144,168},
{0,24,48,72,120,144,168}, {0,24,48,72,96,144,168},
{0,24,48,72,96,120,168}, {0,24,48,72,96,120,144}};
if (lut_numCnInCnGroups[5] > 0)
{
// Number of groups of 64 CNs for parallel processing
......
......@@ -11,16 +11,12 @@ static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBu
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [384];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[0 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [384];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[36 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
}
// Process group with 3 CNs
M = (21*Z + 31)>>5;
......@@ -29,23 +25,17 @@ static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBu
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [1536];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[0 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[252];
p_llrRes = (__m256i*) &llrRes [1536];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[252 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[252 + i]);
}
p_res = &p_bnProcBufRes[504];
p_llrRes = (__m256i*) &llrRes [1536];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[504 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[504 + i]);
}
// Process group with 4 CNs
M = (1*Z + 31)>>5;
......@@ -54,30 +44,22 @@ static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBu
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[0 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[12 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
}
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[24 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [9600];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[36 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
}
// Process group with 5 CNs
M = (1*Z + 31)>>5;
......@@ -86,37 +68,27 @@ static inline void nrLDPC_bnProc_BG1_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBu
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[0 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[12 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
}
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[24 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[36 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
}
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [9984];
for (i=0;i<M;i++) {
*p_res = _mm256_subs_epi8(*p_llrRes, p_bnProcBuf[48 + i]);
p_res++;
p_llrRes++;
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]);
}
// Process group with 6 CNs
// Process group with 7 CNs
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment