Commit f6bb869c authored by Raymond Knopp

fixed AVX2 issue for cnProc code generator. Some cleanup in formatting and deleting unused files.

parent aabd9c6c
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The OpenAirInterface Software Alliance licenses this file to You under
* the OAI Public License, Version 1.1 (the "License"); you may not use this file
* except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.openairinterface.org/?page_id=698
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*/
/*!\file nrLDPC_decoder.c
* \brief Defines the LDPC decoder
* \author Sebastian Wagner (TCL Communications) Email: <mailto:sebastian.wagner@tcl.com>
* \date 30-09-2019
* \version 2.0
* \note
* \warning
*/
#include <stdint.h>
#include <immintrin.h>
#include "nrLDPCdecoder_defs.h"
#include "nrLDPC_types.h"
#include "nrLDPC_init.h"
#include "nrLDPC_mPass.h"
#include "nrLDPC_cnProc.h"
#include "nrLDPC_bnProc.h"
#define NR_LDPC_ENABLE_PARITY_CHECK
#define NR_LDPC_PROFILER_DETAIL
#ifdef NR_LDPC_DEBUG_MODE
#include "nrLDPC_tools/nrLDPC_debug.h"
#endif
static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDPC_procBuf* p_procBuf, uint32_t numLLR, t_nrLDPC_lut* p_lut, t_nrLDPC_dec_params* p_decParams, t_nrLDPC_time_stats* p_profiler);
/**
   \brief LDPC decoder entry point: initializes the decoder LUTs for the given parameters and runs the decoder core on one segment
   \param p_decParams LDPC decoder parameters
   \param p_llr Input LLRs
   \param p_out Output vector
   \param p_procBuf LDPC processing buffers
   \param p_profiler LDPC profiler statistics
   \return Iteration counter reported by nrLDPC_decoder_core
*/
int32_t nrLDPC_decod(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, t_nrLDPC_procBuf* p_procBuf, t_nrLDPC_time_stats* p_profiler)
{
    // LUTs live on the stack for the duration of this call only
    t_nrLDPC_lut lut;

    // Initialize decoder core(s) with correct LUTs; the init routine also reports the number of LLRs
    const uint32_t numLLR = nrLDPC_init(p_decParams, &lut);

    // Launch LDPC decoder core for one segment and hand its iteration counter back to the caller
    return nrLDPC_decoder_core(p_llr, p_out, p_procBuf, numLLR, &lut, p_decParams, p_profiler);
}
// Debug helper: when NR_LDPC_DEBUG_MODE is defined, writes the selected processing buffer
// to file (optionally initializing the file first); otherwise expands to nothing.
#ifdef NR_LDPC_DEBUG_MODE
#define NR_LDPC_CORE_DEBUG_DUMP(bufId, p_buf, doInit) \
    do { \
        if (doInit) { \
            nrLDPC_debug_initBuffer2File(bufId); \
        } \
        nrLDPC_debug_writeBuffer2File(bufId, p_buf); \
    } while (0)
#else
#define NR_LDPC_CORE_DEBUG_DUMP(bufId, p_buf, doInit) do { } while (0)
#endif

/** \brief Runs the (profiled) CN processing step for the selected base graph. */
static inline void nrLDPC_core_cnProc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z, uint8_t BG, t_nrLDPC_time_stats* p_profiler)
{
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->cnProc);
#endif
    if (BG == 1)
    {
#ifdef __AVX512BW__
        nrLDPC_cnProc_BG1_AVX512(p_lut, p_procBuf, Z);
#else
        nrLDPC_cnProc_BG1(p_lut, p_procBuf, Z);
#endif
    }
    else
    {
        nrLDPC_cnProc_BG2(p_lut, p_procBuf, Z);
    }
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->cnProc);
#endif
}

/** \brief Runs the (profiled) transfer of CN results to the BN processing buffer. */
static inline void nrLDPC_core_cn2bnProcBuf(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z, uint8_t BG, t_nrLDPC_time_stats* p_profiler)
{
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->cn2bnProcBuf);
#endif
    if (BG == 1)
    {
        nrLDPC_cn2bnProcBuf_BG1(p_lut, p_procBuf, Z);
    }
    else
    {
        nrLDPC_cn2bnProcBuf_BG2(p_lut, p_procBuf, Z);
    }
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->cn2bnProcBuf);
#endif
}

/** \brief Runs the (profiled) nrLDPC_bnProcPc step. */
static inline void nrLDPC_core_bnProcPc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z, t_nrLDPC_time_stats* p_profiler)
{
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->bnProcPc);
#endif
    nrLDPC_bnProcPc(p_lut, p_procBuf, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->bnProcPc);
#endif
}

/** \brief Runs the (profiled) nrLDPC_bnProc step. */
static inline void nrLDPC_core_bnProc(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z, t_nrLDPC_time_stats* p_profiler)
{
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->bnProc);
#endif
    nrLDPC_bnProc(p_lut, p_procBuf, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->bnProc);
#endif
}

/** \brief Runs the (profiled) transfer of BN results back to the CN processing buffer. */
static inline void nrLDPC_core_bn2cnProcBuf(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z, uint8_t BG, t_nrLDPC_time_stats* p_profiler)
{
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->bn2cnProcBuf);
#endif
    if (BG == 1)
    {
        nrLDPC_bn2cnProcBuf_BG1(p_lut, p_procBuf, Z);
    }
    else
    {
        nrLDPC_bn2cnProcBuf_BG2(p_lut, p_procBuf, Z);
    }
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->bn2cnProcBuf);
#endif
}

/** \brief Runs the (profiled) parity check for the selected base graph and returns its result (0 is treated as "passed" by the caller). */
static inline int32_t nrLDPC_core_parityCheck(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z, uint8_t BG, t_nrLDPC_time_stats* p_profiler)
{
    int32_t pcRes;
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->cnProcPc);
#endif
    if (BG == 1)
    {
        pcRes = nrLDPC_cnProcPc_BG1(p_lut, p_procBuf, Z);
    }
    else
    {
        pcRes = nrLDPC_cnProcPc_BG2(p_lut, p_procBuf, Z);
    }
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->cnProcPc);
#endif
    return pcRes;
}

/**
   \brief Performs LDPC decoding of one code block
   \param p_llr Input LLRs
   \param p_out Output vector
   \param p_procBuf LDPC processing buffers
   \param numLLR Number of LLRs
   \param p_lut Pointer to decoder LUTs
   \param p_decParams LDPC decoder parameters
   \param p_profiler LDPC profiler statistics
   \return Iteration counter. When the parity check is enabled and still fails at the end,
           the counter is incremented once more, so a value > numMaxIter indicates failure.
*/
static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDPC_procBuf* p_procBuf, uint32_t numLLR, t_nrLDPC_lut* p_lut, t_nrLDPC_dec_params* p_decParams, t_nrLDPC_time_stats* p_profiler)
{
    uint16_t Z = p_decParams->Z;
    uint8_t BG = p_decParams->BG;
    uint8_t numMaxIter = p_decParams->numMaxIter;
    e_nrLDPC_outMode outMode = p_decParams->outMode;

    // The first iteration is always executed, so the counter starts at 1
    uint32_t i = 1;
    // Initialize with parity check fail != 0
    int32_t pcRes = 1;
    int8_t* p_llrOut;

    if (outMode == nrLDPC_outMode_LLRINT8)
    {
        // LLR output requested: write results straight into the caller's buffer
        p_llrOut = p_out;
    }
    else
    {
        // Use LLR processing buffer as temporary output buffer
        p_llrOut = p_procBuf->llrProcBuf;
        // Clear llrProcBuf
        memset(p_llrOut, 0, NR_LDPC_MAX_NUM_LLR*sizeof(int8_t));
    }

    // Initialization: copy input LLRs to the LLR processing buffer
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->llr2llrProcBuf);
#endif
    nrLDPC_llr2llrProcBuf(p_lut, p_llr, p_procBuf, Z, BG);
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->llr2llrProcBuf);
#endif
    NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_LLR_PROC, p_procBuf, 1);

    // Initialization: copy input LLRs to the CN processing buffer
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->llr2CnProcBuf);
#endif
    if (BG == 1)
    {
        nrLDPC_llr2CnProcBuf_BG1(p_lut, p_llr, p_procBuf, Z);
    }
    else
    {
        nrLDPC_llr2CnProcBuf_BG2(p_lut, p_llr, p_procBuf, Z);
    }
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->llr2CnProcBuf);
#endif
    NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_CN_PROC, p_procBuf, 1);

    // First iteration (debug files for the intermediate buffers are initialized here)
    // CN processing
    nrLDPC_core_cnProc(p_lut, p_procBuf, Z, BG, p_profiler);
    NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_CN_PROC_RES, p_procBuf, 1);
    // Send CN results to BNs
    nrLDPC_core_cn2bnProcBuf(p_lut, p_procBuf, Z, BG, p_profiler);
    NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_BN_PROC, p_procBuf, 1);
    // BN processing
    nrLDPC_core_bnProcPc(p_lut, p_procBuf, Z, p_profiler);
    NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_LLR_RES, p_procBuf, 1);
    nrLDPC_core_bnProc(p_lut, p_procBuf, Z, p_profiler);
    NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_BN_PROC_RES, p_procBuf, 1);
    // BN results to CN processing buffer
    nrLDPC_core_bn2cnProcBuf(p_lut, p_procBuf, Z, BG, p_profiler);
    NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_CN_PROC, p_procBuf, 0);

    // Parity Check not necessary here since it will fail
    // because first 2 cols/BNs in BG are punctured and cannot be
    // estimated after only one iteration
    // First iteration finished

    // Intermediate iterations.
    // The bound is written as (i + 1 < numMaxIter) instead of (i < numMaxIter - 1):
    // the comparison is done in unsigned arithmetic, so for numMaxIter == 0 the
    // subtraction would wrap around to UINT32_MAX and the loop would not terminate
    // until the parity check happens to pass.
    while ( ((i + 1U) < (uint32_t)numMaxIter) && (pcRes != 0) )
    {
        // Increase iteration counter
        i++;
        // CN processing
        nrLDPC_core_cnProc(p_lut, p_procBuf, Z, BG, p_profiler);
        NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_CN_PROC_RES, p_procBuf, 0);
        // Send CN results back to BNs
        nrLDPC_core_cn2bnProcBuf(p_lut, p_procBuf, Z, BG, p_profiler);
        NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_BN_PROC, p_procBuf, 0);
        // BN Processing
        nrLDPC_core_bnProcPc(p_lut, p_procBuf, Z, p_profiler);
        NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_LLR_RES, p_procBuf, 0);
        nrLDPC_core_bnProc(p_lut, p_procBuf, Z, p_profiler);
        NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_BN_PROC_RES, p_procBuf, 0);
        // BN results to CN processing buffer
        nrLDPC_core_bn2cnProcBuf(p_lut, p_procBuf, Z, BG, p_profiler);
        NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_CN_PROC, p_procBuf, 0);
        // Parity Check
#ifdef NR_LDPC_ENABLE_PARITY_CHECK
        pcRes = nrLDPC_core_parityCheck(p_lut, p_procBuf, Z, BG, p_profiler);
#endif
    }

    // Last iteration
    if ( (i < numMaxIter) && (pcRes != 0) )
    {
        // Increase iteration counter
        i++;
        // CN processing
        nrLDPC_core_cnProc(p_lut, p_procBuf, Z, BG, p_profiler);
        NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_CN_PROC_RES, p_procBuf, 0);
        // Send CN results back to BNs
        nrLDPC_core_cn2bnProcBuf(p_lut, p_procBuf, Z, BG, p_profiler);
        NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_BN_PROC, p_procBuf, 0);
        // BN Processing
        nrLDPC_core_bnProcPc(p_lut, p_procBuf, Z, p_profiler);
        NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_LLR_RES, p_procBuf, 0);
        // If parity check not enabled, no need to send the BN proc results
        // back to CNs
#ifdef NR_LDPC_ENABLE_PARITY_CHECK
        nrLDPC_core_bnProc(p_lut, p_procBuf, Z, p_profiler);
        NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_BN_PROC_RES, p_procBuf, 0);
        // BN results to CN processing buffer
        nrLDPC_core_bn2cnProcBuf(p_lut, p_procBuf, Z, BG, p_profiler);
        NR_LDPC_CORE_DEBUG_DUMP(nrLDPC_buffers_CN_PROC, p_procBuf, 0);
        // Parity Check
        pcRes = nrLDPC_core_parityCheck(p_lut, p_procBuf, Z, BG, p_profiler);
#endif
    }

    // If maximum number of iterations reached and PC still fails increase number of iterations
    // Thus, i > numMaxIter indicates that PC has failed
#ifdef NR_LDPC_ENABLE_PARITY_CHECK
    if (pcRes != 0)
    {
        i++;
    }
#endif

    // Assign results from processing buffer to output
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->llrRes2llrOut);
#endif
    nrLDPC_llrRes2llrOut(p_lut, p_llrOut, p_procBuf, Z, BG);
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->llrRes2llrOut);
#endif

    // Hard-decision (skipped for LLR output mode, where p_llrOut already is p_out)
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->llr2bit);
#endif
    if (outMode == nrLDPC_outMode_BIT)
    {
        nrLDPC_llr2bitPacked(p_out, p_llrOut, numLLR);
    }
    else if (outMode == nrLDPC_outMode_BITINT8)
    {
        nrLDPC_llr2bit(p_out, p_llrOut, numLLR);
    }
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->llr2bit);
#endif

    return i;
}
#undef NR_LDPC_CORE_DEBUG_DUMP
......@@ -39,337 +39,9 @@
\param Z Lifting size
*/
// conditional_negate(a,b,z): per 8-bit lane, yields (z - a) in lanes where the sign bit of b
// is set and leaves a unchanged elsewhere; with z = 0 this negates a wherever b is negative.
// NOTE(review): expands to AVX-512BW intrinsics but is defined outside the __AVX512BW__ guard;
// harmless as long as it is only expanded inside guarded code.
#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a)
#ifdef __AVX512BW__
static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, int8_t* cnProcBuf, int8_t* cnProcBufRes, uint16_t Z)
{
const uint8_t* lut_numCnInCnGroups = p_lut->numCnInCnGroups;
const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;
__m512i* p_cnProcBuf;
__m512i* p_cnProcBufRes;
// Number of CNs in Groups
uint32_t M;
uint32_t i;
uint32_t j;
uint32_t k;
// Offset to each bit within a group in terms of 32 Byte
uint32_t bitOffsetInGroup;
__m512i zmm0, min, sgn, zeros;
zeros = _mm512_setzero_si512();
// maxLLR = _mm512_set1_epi8((char)127);
__m512i* p_cnProcBufResBit;
const __m512i* p_ones = (__m512i*) ones512_epi8;
const __m512i* p_maxLLR = (__m512i*) maxLLR512_epi8;
// LUT with offsets for bits that need to be processed
// 1. bit proc requires LLRs of 2. and 3. bit, 2.bits of 1. and 3. etc.
// Offsets are in units of bitOffsetInGroup
const uint8_t lut_idxCnProcG3[3][2] = {{72,144}, {0,144}, {0,72}};
// =====================================================================
// Process group with 3 BNs
if (lut_numCnInCnGroups[0] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[0]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[0]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 3
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[0]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[0]];
// Loop over every BN
for (j=0; j<3; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
__m512i *pj0 = &p_cnProcBuf[(lut_idxCnProcG3[j][0]/2)];
__m512i *pj1 = &p_cnProcBuf[(lut_idxCnProcG3[j][1]/2)];
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
zmm0 = pj0[i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// 32 CNs of second BN
// zmm0 = p_cnProcBuf[(lut_idxCnProcG3[j][1]/2) + i];
zmm0 = pj1[i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
//p_cnProcBufResBit[i]=_mm512_sign_epi8(min, sgn);
}
}
}
// =====================================================================
// Process group with 4 BNs
// Offset is 20*384/32 = 240
const uint16_t lut_idxCnProcG4[4][3] = {{240,480,720}, {0,480,720}, {0,240,720}, {0,240,480}};
if (lut_numCnInCnGroups[1] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[1]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[1]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 4
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[1]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
for (j=0; j<4; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG4[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<3; k++)
{
zmm0 = p_cnProcBuf[(lut_idxCnProcG4[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
}
}
}
// =====================================================================
// Process group with 5 BNs
// Offset is 9*384/32 = 108
const uint16_t lut_idxCnProcG5[5][4] = {{108,216,324,432}, {0,216,324,432},
{0,108,324,432}, {0,108,216,432}, {0,108,216,324}};
if (lut_numCnInCnGroups[2] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[2]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[2]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 5
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[2]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[2]];
// Loop over every BN
for (j=0; j<5; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG5[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<4; k++)
{
zmm0 = p_cnProcBuf[(lut_idxCnProcG5[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
}
}
}
// =====================================================================
// Process group with 6 BNs
// Offset is 3*384/32 = 36
const uint16_t lut_idxCnProcG6[6][5] = {{36,72,108,144,180}, {0,72,108,144,180},
{0,36,108,144,180}, {0,36,72,144,180},
{0,36,72,108,180}, {0,36,72,108,144}};
if (lut_numCnInCnGroups[3] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[3]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[3]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 6
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[3]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[3]];
// Loop over every BN
for (j=0; j<6; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG6[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<5; k++)
{
zmm0 = p_cnProcBuf[(lut_idxCnProcG6[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
}
}
}
// =====================================================================
// Process group with 8 BNs
// Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,120,144,168}, {0,48,72,96,120,144,168},
{0,24,72,96,120,144,168}, {0,24,48,96,120,144,168},
{0,24,48,72,120,144,168}, {0,24,48,72,96,144,168},
{0,24,48,72,96,120,168}, {0,24,48,72,96,120,144}};
if (lut_numCnInCnGroups[4] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[4]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[4]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 8
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[4]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[4]];
// Loop over every BN
for (j=0; j<8; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG8[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<7; k++)
{
zmm0 = p_cnProcBuf[(lut_idxCnProcG8[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
}
}
}
// =====================================================================
// Process group with 10 BNs
// Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG10[10][9] = {{24,48,72,96,120,144,168,192,216}, {0,48,72,96,120,144,168,192,216},
{0,24,72,96,120,144,168,192,216}, {0,24,48,96,120,144,168,192,216},
{0,24,48,72,120,144,168,192,216}, {0,24,48,72,96,144,168,192,216},
{0,24,48,72,96,120,168,192,216}, {0,24,48,72,96,120,144,192,216},
{0,24,48,72,96,120,144,168,216}, {0,24,48,72,96,120,144,168,192}};
if (lut_numCnInCnGroups[5] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[5]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[5]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 10
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[5]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[5]];
// Loop over every BN
for (j=0; j<10; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG10[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<9; k++)
{
zmm0 = p_cnProcBuf[(lut_idxCnProcG10[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
}
}
}
}
#include "nrLDPC_cnProc_avx512.h"
#else
......@@ -392,361 +64,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, int8_t* cnProcBuf, int
__m256i ymm0, min, sgn;
__m256i* p_cnProcBufResBit;
const __m256i* p_ones = (__m256i*) ones256_epi8;
const __m256i* p_maxLLR = (__m256i*) maxLLR256_epi8;
// LUT with offsets for bits that need to be processed
// 1. bit proc requires LLRs of 2. and 3. bit, 2.bits of 1. and 3. etc.
// Offsets are in units of bitOffsetInGroup
const uint8_t lut_idxCnProcG3[3][2] = {{72,144}, {0,144}, {0,72}};
// =====================================================================
// Process group with 3 BNs
if (lut_numCnInCnGroups[0] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[0]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[0]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 3
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[0]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[0]];
// Loop over every BN
for (j=0; j<3; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
__m256i *pj0 = &p_cnProcBuf[lut_idxCnProcG3[j][0]];
__m256i *pj1 = &p_cnProcBuf[lut_idxCnProcG3[j][1]];
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
ymm0 = pj0[i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
// 32 CNs of second BN
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
ymm0 = pj1[i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
// Store result
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
p_cnProcBufResBit++;
//p_cnProcBufResBit[i]=_mm256_sign_epi8(min, sgn);
}
}
}
// =====================================================================
// Process group with 4 BNs
// Offset is 20*384/32 = 240
const uint16_t lut_idxCnProcG4[4][3] = {{240,480,720}, {0,480,720}, {0,240,720}, {0,240,480}};
if (lut_numCnInCnGroups[1] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[1]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[1]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
for (j=0; j<4; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
ymm0 = p_cnProcBuf[lut_idxCnProcG4[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
// Loop over BNs
for (k=1; k<3; k++)
{
ymm0 = p_cnProcBuf[lut_idxCnProcG4[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
}
// Store result
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
p_cnProcBufResBit++;
}
}
}
// =====================================================================
// Process group with 5 BNs
// Offset is 9*384/32 = 108
const uint16_t lut_idxCnProcG5[5][4] = {{108,216,324,432}, {0,216,324,432},
{0,108,324,432}, {0,108,216,432}, {0,108,216,324}};
if (lut_numCnInCnGroups[2] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[2]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[2]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 5
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[2]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[2]];
// Loop over every BN
for (j=0; j<5; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
ymm0 = p_cnProcBuf[lut_idxCnProcG5[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
// Loop over BNs
for (k=1; k<4; k++)
{
ymm0 = p_cnProcBuf[lut_idxCnProcG5[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
}
// Store result
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
p_cnProcBufResBit++;
}
}
}
// =====================================================================
// Process group with 6 BNs
// Offset is 3*384/32 = 36
const uint16_t lut_idxCnProcG6[6][5] = {{36,72,108,144,180}, {0,72,108,144,180},
{0,36,108,144,180}, {0,36,72,144,180},
{0,36,72,108,180}, {0,36,72,108,144}};
if (lut_numCnInCnGroups[3] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[3]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[3]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 6
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[3]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[3]];
// Loop over every BN
for (j=0; j<6; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
ymm0 = p_cnProcBuf[lut_idxCnProcG6[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
// Loop over BNs
for (k=1; k<5; k++)
{
ymm0 = p_cnProcBuf[lut_idxCnProcG6[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
}
// Store result
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
p_cnProcBufResBit++;
}
}
}
// =====================================================================
// Process group with 8 BNs
// Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,120,144,168}, {0,48,72,96,120,144,168},
{0,24,72,96,120,144,168}, {0,24,48,96,120,144,168},
{0,24,48,72,120,144,168}, {0,24,48,72,96,144,168},
{0,24,48,72,96,120,168}, {0,24,48,72,96,120,144}};
if (lut_numCnInCnGroups[4] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[4]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[4]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 8
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[4]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[4]];
// Loop over every BN
for (j=0; j<8; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
ymm0 = p_cnProcBuf[lut_idxCnProcG8[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
// Loop over BNs
for (k=1; k<7; k++)
{
ymm0 = p_cnProcBuf[lut_idxCnProcG8[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
}
// Store result
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
p_cnProcBufResBit++;
}
}
}
// =====================================================================
// Process group with 10 BNs
// Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG10[10][9] = {{24,48,72,96,120,144,168,192,216}, {0,48,72,96,120,144,168,192,216},
{0,24,72,96,120,144,168,192,216}, {0,24,48,96,120,144,168,192,216},
{0,24,48,72,120,144,168,192,216}, {0,24,48,72,96,144,168,192,216},
{0,24,48,72,96,120,168,192,216}, {0,24,48,72,96,120,144,192,216},
{0,24,48,72,96,120,144,168,216}, {0,24,48,72,96,120,144,168,192}};
if (lut_numCnInCnGroups[5] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[5]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[5]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 10
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[5]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[5]];
// Loop over every BN
for (j=0; j<10; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
ymm0 = p_cnProcBuf[lut_idxCnProcG10[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
// Loop over BNs
for (k=1; k<9; k++)
{
ymm0 = p_cnProcBuf[lut_idxCnProcG10[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
}
// Store result
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
p_cnProcBufResBit++;
}
}
}
}
#endif
/**
\brief Performs CN processing for BG1 on the CN processing buffer and stores the results in the CN processing results buffer.
\param p_lut Pointer to decoder LUTs
\param Z Lifting size
*/
#ifdef __AVX512BW__
static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
const uint8_t* lut_numCnInCnGroups = p_lut->numCnInCnGroups;
const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;
int8_t* cnProcBuf = p_procBuf->cnProcBuf;
int8_t* cnProcBufRes = p_procBuf->cnProcBufRes;
__m512i* p_cnProcBuf;
__m512i* p_cnProcBufRes;
// Number of CNs in Groups
uint32_t M;
uint32_t i;
uint32_t j;
uint32_t k;
// Offset to each bit within a group in terms of 32 Byte
uint32_t bitOffsetInGroup;
__m512i zmm0, min, sgn, zeros;
zeros = _mm512_setzero_si512();
// maxLLR = _mm512_set1_epi8((char)127);
__m512i* p_cnProcBufResBit;
const __m512i* p_ones = (__m512i*) ones512_epi8;
const __m512i* p_maxLLR = (__m512i*) maxLLR512_epi8;
const __m256i* p_ones = (__m256i*) ones256_epi8;
const __m256i* p_maxLLR = (__m256i*) maxLLR256_epi8;
// LUT with offsets for bits that need to be processed
// Processing bit 1 requires the LLRs of bits 2 and 3, bit 2 those of bits 1 and 3, etc.
// Offsets are in units of bitOffsetInGroup (1*384/32)
const uint8_t lut_idxCnProcG3[3][2] = {{12,24}, {0,24}, {0,12}};
// Offsets are in units of bitOffsetInGroup
const uint8_t lut_idxCnProcG3[3][2] = {{72,144}, {0,144}, {0,72}};
// =====================================================================
// Process group with 3 BNs
......@@ -755,14 +79,13 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[0]*Z + 63)>>6;
M = (lut_numCnInCnGroups[0]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>6;
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[0]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 3
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[0]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[0]];
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[0]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[0]];
// Loop over every BN
for (j=0; j<3; j++)
......@@ -770,24 +93,29 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
__m256i *pj0 = &p_cnProcBuf[lut_idxCnProcG3[j][0]];
__m256i *pj1 = &p_cnProcBuf[lut_idxCnProcG3[j][1]];
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG3[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
ymm0 = pj0[i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
// 32 CNs of second BN
zmm0 = p_cnProcBuf[(lut_idxCnProcG3[j][1]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
ymm0 = pj1[i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
//*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
//p_cnProcBufResBit++;
p_cnProcBufResBit[i]=_mm256_sign_epi8(min, sgn);
}
}
}
......@@ -795,21 +123,20 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// =====================================================================
// Process group with 4 BNs
// Offset is 5*384/32 = 60
const uint8_t lut_idxCnProcG4[4][3] = {{60,120,180}, {0,120,180}, {0,60,180}, {0,60,120}};
// Offset is 20*384/32 = 240
const uint16_t lut_idxCnProcG4[4][3] = {{240,480,720}, {0,480,720}, {0,240,720}, {0,240,480}};
if (lut_numCnInCnGroups[1] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[1]*Z + 63)>>6;
M = (lut_numCnInCnGroups[1]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>6;
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[1]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[1]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
for (j=0; j<4; j++)
......@@ -821,21 +148,21 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG4[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
ymm0 = p_cnProcBuf[lut_idxCnProcG4[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
// Loop over BNs
for (k=1; k<3; k++)
{
zmm0 = p_cnProcBuf[(lut_idxCnProcG4[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
ymm0 = p_cnProcBuf[lut_idxCnProcG4[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
}
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
p_cnProcBufResBit++;
}
}
......@@ -844,22 +171,21 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// =====================================================================
// Process group with 5 BNs
// Offset is 18*384/32 = 216
const uint16_t lut_idxCnProcG5[5][4] = {{216,432,648,864}, {0,432,648,864},
{0,216,648,864}, {0,216,432,864}, {0,216,432,648}};
// Offset is 9*384/32 = 108
const uint16_t lut_idxCnProcG5[5][4] = {{108,216,324,432}, {0,216,324,432},
{0,108,324,432}, {0,108,216,432}, {0,108,216,324}};
if (lut_numCnInCnGroups[2] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[2]*Z + 63)>>6;
M = (lut_numCnInCnGroups[2]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>6;
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[2]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 5
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[2]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[2]];
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[2]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[2]];
// Loop over every BN
for (j=0; j<5; j++)
......@@ -871,21 +197,21 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG5[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
ymm0 = p_cnProcBuf[lut_idxCnProcG5[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
// Loop over BNs
for (k=1; k<4; k++)
{
zmm0 = p_cnProcBuf[(lut_idxCnProcG5[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
ymm0 = p_cnProcBuf[lut_idxCnProcG5[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
}
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
p_cnProcBufResBit++;
}
}
......@@ -894,23 +220,22 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// =====================================================================
// Process group with 6 BNs
// Offset is 8*384/32 = 96
const uint16_t lut_idxCnProcG6[6][5] = {{96,192,288,384,480}, {0,192,288,384,480},
{0,96,288,384,480}, {0,96,192,384,480},
{0,96,192,288,480}, {0,96,192,288,384}};
// Offset is 3*384/32 = 36
const uint16_t lut_idxCnProcG6[6][5] = {{36,72,108,144,180}, {0,72,108,144,180},
{0,36,108,144,180}, {0,36,72,144,180},
{0,36,72,108,180}, {0,36,72,108,144}};
if (lut_numCnInCnGroups[3] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[3]*Z + 63)>>6;
M = (lut_numCnInCnGroups[3]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>6;
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[3]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 6
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[3]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[3]];
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[3]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[3]];
// Loop over every BN
for (j=0; j<6; j++)
......@@ -922,73 +247,21 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG6[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
ymm0 = p_cnProcBuf[lut_idxCnProcG6[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
// Loop over BNs
for (k=1; k<5; k++)
{
zmm0 = p_cnProcBuf[(lut_idxCnProcG6[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
}
}
}
// =====================================================================
// Process group with 7 BNs
// Offset is 5*384/32 = 60
const uint16_t lut_idxCnProcG7[7][6] = {{60,120,180,240,300,360}, {0,120,180,240,300,360},
{0,60,180,240,300,360}, {0,60,120,240,300,360},
{0,60,120,180,300,360}, {0,60,120,180,240,360},
{0,60,120,180,240,300}};
if (lut_numCnInCnGroups[4] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[4]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 7
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[4]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[4]];
// Loop over every BN
for (j=0; j<7; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG7[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<6; k++)
{
zmm0 = p_cnProcBuf[(lut_idxCnProcG7[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
ymm0 = p_cnProcBuf[lut_idxCnProcG6[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
}
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
p_cnProcBufResBit++;
}
}
......@@ -1003,18 +276,17 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
{0,24,48,72,120,144,168}, {0,24,48,72,96,144,168},
{0,24,48,72,96,120,168}, {0,24,48,72,96,120,144}};
if (lut_numCnInCnGroups[5] > 0)
if (lut_numCnInCnGroups[4] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[5]*Z + 63)>>6;
M = (lut_numCnInCnGroups[4]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>6;
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[4]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 8
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[5]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[5]];
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[4]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[4]];
// Loop over every BN
for (j=0; j<8; j++)
......@@ -1026,74 +298,21 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG8[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
ymm0 = p_cnProcBuf[lut_idxCnProcG8[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
// Loop over BNs
for (k=1; k<7; k++)
{
zmm0 = p_cnProcBuf[(lut_idxCnProcG8[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
}
}
}
// =====================================================================
// Process group with 9 BNs
// Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG9[9][8] = {{24,48,72,96,120,144,168,192}, {0,48,72,96,120,144,168,192},
{0,24,72,96,120,144,168,192}, {0,24,48,96,120,144,168,192},
{0,24,48,72,120,144,168,192}, {0,24,48,72,96,144,168,192},
{0,24,48,72,96,120,168,192}, {0,24,48,72,96,120,144,192},
{0,24,48,72,96,120,144,168}};
if (lut_numCnInCnGroups[6] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[6]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 9
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[6]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[6]];
// Loop over every BN
for (j=0; j<9; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG9[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<8; k++)
{
zmm0 = p_cnProcBuf[(lut_idxCnProcG9[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
ymm0 = p_cnProcBuf[lut_idxCnProcG8[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
}
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
p_cnProcBufResBit++;
}
}
......@@ -1102,25 +321,24 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// =====================================================================
// Process group with 10 BNs
// Offset is 1*384/32 = 12
const uint8_t lut_idxCnProcG10[10][9] = {{12,24,36,48,60,72,84,96,108}, {0,24,36,48,60,72,84,96,108},
{0,12,36,48,60,72,84,96,108}, {0,12,24,48,60,72,84,96,108},
{0,12,24,36,60,72,84,96,108}, {0,12,24,36,48,72,84,96,108},
{0,12,24,36,48,60,84,96,108}, {0,12,24,36,48,60,72,96,108},
{0,12,24,36,48,60,72,84,108}, {0,12,24,36,48,60,72,84,96}};
// Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG10[10][9] = {{24,48,72,96,120,144,168,192,216}, {0,48,72,96,120,144,168,192,216},
{0,24,72,96,120,144,168,192,216}, {0,24,48,96,120,144,168,192,216},
{0,24,48,72,120,144,168,192,216}, {0,24,48,72,96,144,168,192,216},
{0,24,48,72,96,120,168,192,216}, {0,24,48,72,96,120,144,192,216},
{0,24,48,72,96,120,144,168,216}, {0,24,48,72,96,120,144,168,192}};
if (lut_numCnInCnGroups[7] > 0)
if (lut_numCnInCnGroups[5] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[7]*Z + 63)>>6;
M = (lut_numCnInCnGroups[5]*Z + 31)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>6;
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[5]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 10
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[7]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[7]];
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[5]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[5]];
// Loop over every BN
for (j=0; j<10; j++)
......@@ -1132,87 +350,37 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG10[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
ymm0 = p_cnProcBuf[lut_idxCnProcG10[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
// Loop over BNs
for (k=1; k<9; k++)
{
zmm0 = p_cnProcBuf[(lut_idxCnProcG10[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
ymm0 = p_cnProcBuf[lut_idxCnProcG10[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
}
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
p_cnProcBufResBit++;
}
}
}
// =====================================================================
// Process group with 19 BNs
// Offset is 4*384/32 = 48
const uint16_t lut_idxCnProcG19[19][18] = {{48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816}};
if (lut_numCnInCnGroups[8] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[8]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 19
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[8]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[8]];
// Loop over every BN
for (j=0; j<19; j++)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
}
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG19[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<18; k++)
{
zmm0 = p_cnProcBuf[(lut_idxCnProcG19[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
/**
\brief Performs CN processing for BG1 on the CN processing buffer and stores the results in the CN processing results buffer.
\param p_lut Pointer to decoder LUTs
\param Z Lifting size
*/
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
}
}
}
}
#else
/**
\brief Performs CN processing for BG1 on the CN processing buffer and stores the results in the CN processing results buffer.
\param p_lut Pointer to decoder LUTs
......@@ -1708,6 +876,7 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, int8_t* cnProcBuf, int
}
}
#endif
/**
\brief Performs parity check for BG1 on the CN processing buffer. Stops as soon as error is detected.
......
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
......@@ -19,10 +20,10 @@
* contact@openairinterface.org
*/
/*!\file nrLDPC_cnProc.h
/*!\file nrLDPC_cnProc_avx512.h
* \brief Defines the functions for check node processing
* \author Sebastian Wagner (TCL Communications) Email: <mailto:sebastian.wagner@tcl.com>
* \date 30-09-2019
* \date 30-09-2021
* \version 1.0
* \note
* \warning
......@@ -31,22 +32,14 @@
#ifndef __NR_LDPC_CNPROC__H__
#define __NR_LDPC_CNPROC__H__
/**
\brief Performs CN processing for BG2 on the CN processing buffer and stores the results in the CN processing results buffer.
\param p_lut Pointer to decoder LUTs
\param p_procBuf Pointer to processing buffers
\param Z Lifting size
*/
static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
/*
 * Per-byte conditional negation for 512-bit vectors (AVX-512BW).
 * _mm512_movepi8_mask() builds a lane mask from the most significant (sign)
 * bit of each 8-bit lane of signSrc. For every lane whose mask bit is set the
 * result is (zero - val); all other lanes are copied from val unchanged.
 * The visible callers pass an all-zero vector as zero, so the masked
 * subtraction yields -val.
 * NOTE: val appears twice in the expansion, so it must not have side effects.
 */
#define conditional_negate(val,signSrc,zero) _mm512_mask_sub_epi8((val),_mm512_movepi8_mask((signSrc)),(zero),(val))
static inline void nrLDPC_cnProc_BG2_AVX512(t_nrLDPC_lut* p_lut, int8_t* cnProcBuf, int8_t* cnProcBufRes, uint16_t Z)
{
const uint8_t* lut_numCnInCnGroups = p_lut->numCnInCnGroups;
const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;
int8_t* cnProcBuf = p_procBuf->cnProcBuf;
int8_t* cnProcBufRes = p_procBuf->cnProcBufRes;
__m256i* p_cnProcBuf;
__m256i* p_cnProcBufRes;
__m512i* p_cnProcBuf;
__m512i* p_cnProcBufRes;
// Number of CNs in Groups
uint32_t M;
......@@ -56,11 +49,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Offset to each bit within a group in terms of 32 Byte
uint32_t bitOffsetInGroup;
__m256i ymm0, min, sgn;
__m256i* p_cnProcBufResBit;
__m512i zmm0, min, sgn, zeros;
zeros = _mm512_setzero_si512();
// maxLLR = _mm512_set1_epi8((char)127);
__m512i* p_cnProcBufResBit;
const __m256i* p_ones = (__m256i*) ones256_epi8;
const __m256i* p_maxLLR = (__m256i*) maxLLR256_epi8;
const __m512i* p_ones = (__m512i*) ones512_epi8;
const __m512i* p_maxLLR = (__m512i*) maxLLR512_epi8;
// LUT with offsets for bits that need to be processed
// 1. bit proc requires LLRs of 2. and 3. bit, 2.bits of 1. and 3. etc.
......@@ -74,13 +69,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[0]*Z + 31)>>5;
M = (lut_numCnInCnGroups[0]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[0]*NR_LDPC_ZMAX)>>5;
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[0]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 3
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[0]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[0]];
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[0]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[0]];
// Loop over every BN
for (j=0; j<3; j++)
......@@ -88,29 +83,29 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Set of results pointer to correct BN address
p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);
__m256i *pj0 = &p_cnProcBuf[lut_idxCnProcG3[j][0]];
__m256i *pj1 = &p_cnProcBuf[lut_idxCnProcG3[j][1]];
__m512i *pj0 = &p_cnProcBuf[(lut_idxCnProcG3[j][0]/2)];
__m512i *pj1 = &p_cnProcBuf[(lut_idxCnProcG3[j][1]/2)];
// Loop over CNs
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
ymm0 = pj0[i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
zmm0 = pj0[i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// 32 CNs of second BN
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
ymm0 = pj1[i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
// zmm0 = p_cnProcBuf[(lut_idxCnProcG3[j][1]/2) + i];
zmm0 = pj1[i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
// Store result
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
//p_cnProcBufResBit[i]=_mm256_sign_epi8(min, sgn);
//p_cnProcBufResBit[i]=_mm512_sign_epi8(min, sgn);
}
}
}
......@@ -125,13 +120,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[1]*Z + 31)>>5;
M = (lut_numCnInCnGroups[1]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[1]*NR_LDPC_ZMAX)>>5;
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[1]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 4
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[1]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
for (j=0; j<4; j++)
......@@ -143,21 +138,21 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
ymm0 = p_cnProcBuf[lut_idxCnProcG4[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
zmm0 = p_cnProcBuf[(lut_idxCnProcG4[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<3; k++)
{
ymm0 = p_cnProcBuf[lut_idxCnProcG4[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
zmm0 = p_cnProcBuf[(lut_idxCnProcG4[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
// Store result
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
}
}
......@@ -174,13 +169,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[2]*Z + 31)>>5;
M = (lut_numCnInCnGroups[2]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[2]*NR_LDPC_ZMAX)>>5;
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[2]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 5
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[2]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[2]];
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[2]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[2]];
// Loop over every BN
for (j=0; j<5; j++)
......@@ -192,21 +187,21 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
ymm0 = p_cnProcBuf[lut_idxCnProcG5[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
zmm0 = p_cnProcBuf[(lut_idxCnProcG5[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<4; k++)
{
ymm0 = p_cnProcBuf[lut_idxCnProcG5[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
zmm0 = p_cnProcBuf[(lut_idxCnProcG5[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
// Store result
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
}
}
......@@ -224,13 +219,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[3]*Z + 31)>>5;
M = (lut_numCnInCnGroups[3]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[3]*NR_LDPC_ZMAX)>>5;
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[3]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 6
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[3]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[3]];
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[3]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[3]];
// Loop over every BN
for (j=0; j<6; j++)
......@@ -242,21 +237,21 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
ymm0 = p_cnProcBuf[lut_idxCnProcG6[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
zmm0 = p_cnProcBuf[(lut_idxCnProcG6[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<5; k++)
{
ymm0 = p_cnProcBuf[lut_idxCnProcG6[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
zmm0 = p_cnProcBuf[(lut_idxCnProcG6[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
// Store result
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
}
}
......@@ -275,13 +270,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[4]*Z + 31)>>5;
M = (lut_numCnInCnGroups[4]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[4]*NR_LDPC_ZMAX)>>5;
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[4]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 8
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[4]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[4]];
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[4]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[4]];
// Loop over every BN
for (j=0; j<8; j++)
......@@ -293,21 +288,21 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
ymm0 = p_cnProcBuf[lut_idxCnProcG8[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
zmm0 = p_cnProcBuf[(lut_idxCnProcG8[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<7; k++)
{
ymm0 = p_cnProcBuf[lut_idxCnProcG8[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
zmm0 = p_cnProcBuf[(lut_idxCnProcG8[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
// Store result
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
}
}
......@@ -327,13 +322,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[5]*Z + 31)>>5;
M = (lut_numCnInCnGroups[5]*Z + 63)>>6;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[5]*NR_LDPC_ZMAX)>>5;
bitOffsetInGroup = (lut_numCnInCnGroups_BG2_R15[5]*NR_LDPC_ZMAX)>>6;
// Set pointers to start of group 10
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[5]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[5]];
p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[5]];
p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[5]];
// Loop over every BN
for (j=0; j<10; j++)
......@@ -345,21 +340,21 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
for (i=0; i<M; i++)
{
// Abs and sign of 32 CNs (first BN)
ymm0 = p_cnProcBuf[lut_idxCnProcG10[j][0] + i];
sgn = _mm256_sign_epi8(*p_ones, ymm0);
min = _mm256_abs_epi8(ymm0);
zmm0 = p_cnProcBuf[(lut_idxCnProcG10[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// Loop over BNs
for (k=1; k<9; k++)
{
ymm0 = p_cnProcBuf[lut_idxCnProcG10[j][k] + i];
min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
sgn = _mm256_sign_epi8(sgn, ymm0);
zmm0 = p_cnProcBuf[(lut_idxCnProcG10[j][k]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
}
// Store result
min = _mm256_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
*p_cnProcBufResBit = conditional_negate(min, sgn,zeros);
p_cnProcBufResBit++;
}
}
......@@ -367,15 +362,6 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
}
/**
\brief Performs CN processing for BG1 on the CN processing buffer and stores the results in the CN processing results buffer.
\param p_lut Pointer to decoder LUTs
\param Z Lifting size
*/
#ifdef __AVX512BW__
// Per-byte conditional negation: in every lane where the sign bit of b is set the result is (z - a),
// in all other lanes a is passed through unchanged. With z == 0 this yields -a for those lanes.
// Note: a is expanded twice, so it must be free of side effects.
#define conditional_negate(a,b,z) _mm512_mask_sub_epi8((a),_mm512_movepi8_mask((b)),(z),(a))
static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
const uint8_t* lut_numCnInCnGroups = p_lut->numCnInCnGroups;
......@@ -383,7 +369,7 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
int8_t* cnProcBuf = p_procBuf->cnProcBuf;
int8_t* cnProcBufRes = p_procBuf->cnProcBufRes;
__m512i* p_cnProcBuf;
__m512i* p_cnProcBufRes;
......@@ -395,16 +381,16 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// Offset to each bit within a group in terms of 32 Byte
uint32_t bitOffsetInGroup;
__m512i zmm0, min, sgn, zeros,maxLLR;
__m512i zmm0, min, sgn, zeros;
zeros = _mm512_setzero_si512();
maxLLR = _mm512_set1_epi8((char)127);
// maxLLR = _mm512_set1_epi8((char)127);
__m512i* p_cnProcBufResBit;
const __m512i* p_ones = (__m512i*) ones512_epi8;
const __m512i* p_maxLLR = (__m512i*) maxLLR512_epi8;
......@@ -440,14 +426,14 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
{
// Abs and sign of 32 CNs (first BN)
zmm0 = p_cnProcBuf[(lut_idxCnProcG3[j][0]/2) + i];
sgn = _mm512_xor_si512(*p_ones, zmm0);
sgn = _mm512_xor_si512(*p_ones, zmm0);
min = _mm512_abs_epi8(zmm0);
// 32 CNs of second BN
zmm0 = p_cnProcBuf[(lut_idxCnProcG3[j][1]/2) + i];
min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
sgn = _mm512_xor_si512(sgn, zmm0);
// Store result
min = _mm512_min_epu8(min, *p_maxLLR); // 128 in epi8 is -127
......@@ -876,1569 +862,4 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
}
}
#else
/**
   \brief Min-sum CN update for a single CN group of BG1 (AVX2 path, one __m256i = 32 int8 LLRs).
          For every BN j of the group, the message written to the results buffer is built from
          the other (numBn-1) BNs of the same CN:
            magnitude = minimum of their absolute values, clamped to maxLLR256_epi8
            sign      = product of their signs (a zero input yields a zero output)
          The other BNs are visited in ascending order, skipping j, exactly like the former
          hand-written index tables ({m*bnStride : m != j}).
          Intended to be inlined with literal group/numBn/bnStride so the loops get constant bounds.
   \param p_lut Pointer to decoder LUTs
   \param p_procBuf Pointer to the processing buffers (cnProcBuf is read, cnProcBufRes is written)
   \param Z Lifting size
   \param group Index of the CN group inside the LUT arrays
   \param numBn Number of BNs connected to each CN of this group (must be >= 2)
   \param bnStride Distance between two consecutive BNs inside cnProcBuf, in units of __m256i
*/
static inline void nrLDPC_cnProc_BG1_group(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z, uint32_t group, uint32_t numBn, uint32_t bnStride)
{
    const uint8_t*  lut_numCnInCnGroups   = p_lut->numCnInCnGroups;
    const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;

    // Nothing to do when the group holds no CNs for the current code rate
    if (lut_numCnInCnGroups[group] == 0)
    {
        return;
    }

    // Number of 32-CN vectors to process (ceil for sizes not divisible by 32)
    const uint32_t M = (lut_numCnInCnGroups[group]*Z + 31)>>5;

    // Distance between two consecutive BNs inside cnProcBufRes, in units of __m256i.
    // NOTE(review): per the offset comments of the former tables this is expected to equal
    // bnStride; both are kept so that reads and writes are addressed exactly as before.
    // TODO confirm against lut_numCnInCnGroups_BG1_R13 / NR_LDPC_ZMAX and unify.
    const uint32_t bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[group]*NR_LDPC_ZMAX)>>5;

    // Start of this group in the input and in the results buffer
    const __m256i* p_cnProcBuf    = (const __m256i*) &p_procBuf->cnProcBuf   [lut_startAddrCnGroups[group]];
    __m256i*       p_cnProcBufRes = (__m256i*)       &p_procBuf->cnProcBufRes[lut_startAddrCnGroups[group]];

    const __m256i ones   = *((const __m256i*) ones256_epi8);
    const __m256i maxLLR = *((const __m256i*) maxLLR256_epi8);

    __m256i ymm0, min, sgn;
    uint32_t i;
    uint32_t j;
    uint32_t k;

    // Loop over every BN
    for (j=0; j<numBn; j++)
    {
        // Results of BN j
        __m256i* p_cnProcBufResBit = p_cnProcBufRes + (j*bitOffsetInGroup);

        // First BN that contributes to the message for BN j
        const uint32_t kFirst = (j == 0) ? 1 : 0;

        // Loop over CNs, 32 at a time
        for (i=0; i<M; i++)
        {
            // Abs and sign of the first contributing BN
            ymm0 = p_cnProcBuf[kFirst*bnStride + i];
            sgn  = _mm256_sign_epi8(ones, ymm0);
            min  = _mm256_abs_epi8(ymm0);

            // Remaining contributing BNs
            for (k=kFirst+1; k<numBn; k++)
            {
                if (k == j)
                {
                    continue;
                }

                ymm0 = p_cnProcBuf[k*bnStride + i];
                min  = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
                sgn  = _mm256_sign_epi8(sgn, ymm0);
            }

            // abs(-128) stays 0x80 (128 when read as unsigned); the unsigned min against
            // maxLLR brings the magnitude back into the positive int8 range before the
            // sign is applied.
            min = _mm256_min_epu8(min, maxLLR);
            *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
            p_cnProcBufResBit++;
        }
    }
}

/**
   \brief Performs CN processing for BG1 on the CN processing buffer and stores the results in the CN processing results buffer.
          AVX2 implementation: each of the nine BG1 CN groups is handled by nrLDPC_cnProc_BG1_group().
   \param p_lut Pointer to decoder LUTs
   \param p_procBuf Pointer to the processing buffers
   \param Z Lifting size
*/
static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
    // Arguments: LUT group index, BNs per CN, BN stride in cnProcBuf in units of 32 Byte
    nrLDPC_cnProc_BG1_group(p_lut, p_procBuf, Z, 0,  3,  12); // group with  3 BNs, stride  1*384/32 = 12
    nrLDPC_cnProc_BG1_group(p_lut, p_procBuf, Z, 1,  4,  60); // group with  4 BNs, stride  5*384/32 = 60
    nrLDPC_cnProc_BG1_group(p_lut, p_procBuf, Z, 2,  5, 216); // group with  5 BNs, stride 18*384/32 = 216
    nrLDPC_cnProc_BG1_group(p_lut, p_procBuf, Z, 3,  6,  96); // group with  6 BNs, stride  8*384/32 = 96
    nrLDPC_cnProc_BG1_group(p_lut, p_procBuf, Z, 4,  7,  60); // group with  7 BNs, stride  5*384/32 = 60
    nrLDPC_cnProc_BG1_group(p_lut, p_procBuf, Z, 5,  8,  24); // group with  8 BNs, stride  2*384/32 = 24
    nrLDPC_cnProc_BG1_group(p_lut, p_procBuf, Z, 6,  9,  24); // group with  9 BNs, stride  2*384/32 = 24
    nrLDPC_cnProc_BG1_group(p_lut, p_procBuf, Z, 7, 10,  12); // group with 10 BNs, stride  1*384/32 = 12
    nrLDPC_cnProc_BG1_group(p_lut, p_procBuf, Z, 8, 19,  48); // group with 19 BNs, stride  4*384/32 = 48
}
#endif
/**
\brief Performs parity check for BG1 on the CN processing buffer. Stops as soon as error is detected.
\param p_lut Pointer to decoder LUTs
\param Z Lifting size
\return 32-bit parity check indicator
*/
static inline uint32_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
const uint8_t* lut_numCnInCnGroups = p_lut->numCnInCnGroups;
const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;
int8_t* cnProcBuf = p_procBuf->cnProcBuf;
int8_t* cnProcBufRes = p_procBuf->cnProcBufRes;
__m256i* p_cnProcBuf;
__m256i* p_cnProcBufRes;
// Number of CNs in Groups
uint32_t M;
uint32_t i;
uint32_t j;
uint32_t pcRes = 0;
uint32_t pcResSum = 0;
uint32_t Mrem;
uint32_t M32;
__m256i ymm0, ymm1;
// =====================================================================
// Process group with 3 BNs
if (lut_numCnInCnGroups[0] > 0)
{
// Reset results
pcResSum = 0;
// Number of CNs in group
M = lut_numCnInCnGroups[0]*Z;
// Remainder modulo 32
Mrem = M&31;
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M32 = (M + 31)>>5;
// Set pointers to start of group 3
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[0]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[0]];
// Loop over CNs
for (i=0; i<(M32-1); i++)
{
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<3; j++)
{
// BN offset is units of (1*384/32) = 12
ymm0 = p_cnProcBuf [j*12 + i];
ymm1 = p_cnProcBufRes[j*12 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
pcResSum |= pcRes;
}
// Last 32 CNs might not be full valid 32 depending on Z
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<3; j++)
{
// BN offset is units of (1*384/32) = 12
ymm0 = p_cnProcBuf [j*12 + i];
ymm1 = p_cnProcBufRes[j*12 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
// Only use valid CNs
pcResSum |= (pcRes&(0xFFFFFFFF>>(32-Mrem)));
// If PC failed we can stop here
if (pcResSum > 0)
{
return pcResSum;
}
}
// =====================================================================
// Process group with 4 BNs
if (lut_numCnInCnGroups[1] > 0)
{
// Reset results
pcResSum = 0;
// Number of CNs in group
M = lut_numCnInCnGroups[1]*Z;
// Remainder modulo 32
Mrem = M&31;
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M32 = (M + 31)>>5;
// Set pointers to start of group 4
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over CNs
for (i=0; i<(M32-1); i++)
{
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<4; j++)
{
// BN offset is units of 5*384/32 = 60
ymm0 = p_cnProcBuf [j*60 + i];
ymm1 = p_cnProcBufRes[j*60 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
pcResSum |= pcRes;
}
// Last 32 CNs might not be full valid 32 depending on Z
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<4; j++)
{
// BN offset is units of 5*384/32 = 60
ymm0 = p_cnProcBuf [j*60 + i];
ymm1 = p_cnProcBufRes[j*60 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
// Only use valid CNs
pcResSum |= (pcRes&(0xFFFFFFFF>>(32-Mrem)));
// If PC failed we can stop here
if (pcResSum > 0)
{
return pcResSum;
}
}
// =====================================================================
// Process group with 5 BNs
if (lut_numCnInCnGroups[2] > 0)
{
// Reset results
pcResSum = 0;
// Number of CNs in group
M = lut_numCnInCnGroups[2]*Z;
// Remainder modulo 32
Mrem = M&31;
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M32 = (M + 31)>>5;
// Set pointers to start of group 5
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[2]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[2]];
// Loop over CNs
for (i=0; i<(M32-1); i++)
{
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<5; j++)
{
// BN offset is units of 18*384/32 = 216
ymm0 = p_cnProcBuf [j*216 + i];
ymm1 = p_cnProcBufRes[j*216 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
pcResSum |= pcRes;
}
// Last 32 CNs might not be full valid 32 depending on Z
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<5; j++)
{
// BN offset is units of 18*384/32 = 216
ymm0 = p_cnProcBuf [j*216 + i];
ymm1 = p_cnProcBufRes[j*216 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
// Only use valid CNs
pcResSum |= (pcRes&(0xFFFFFFFF>>(32-Mrem)));
// If PC failed we can stop here
if (pcResSum > 0)
{
return pcResSum;
}
}
// =====================================================================
// Process group with 6 BNs
if (lut_numCnInCnGroups[3] > 0)
{
// Reset results
pcResSum = 0;
// Number of CNs in group
M = lut_numCnInCnGroups[3]*Z;
// Remainder modulo 32
Mrem = M&31;
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M32 = (M + 31)>>5;
// Set pointers to start of group 6
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[3]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[3]];
// Loop over CNs
for (i=0; i<(M32-1); i++)
{
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<6; j++)
{
// BN offset is units of 8*384/32 = 96
ymm0 = p_cnProcBuf [j*96 + i];
ymm1 = p_cnProcBufRes[j*96 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
pcResSum |= pcRes;
}
// Last 32 CNs might not be full valid 32 depending on Z
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<6; j++)
{
// BN offset is units of 8*384/32 = 96
ymm0 = p_cnProcBuf [j*96 + i];
ymm1 = p_cnProcBufRes[j*96 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
// Only use valid CNs
pcResSum |= (pcRes&(0xFFFFFFFF>>(32-Mrem)));
// If PC failed we can stop here
if (pcResSum > 0)
{
return pcResSum;
}
}
// =====================================================================
// Process group with 7 BNs
if (lut_numCnInCnGroups[4] > 0)
{
// Reset results
pcResSum = 0;
// Number of CNs in group
M = lut_numCnInCnGroups[4]*Z;
// Remainder modulo 32
Mrem = M&31;
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M32 = (M + 31)>>5;
// Set pointers to start of group 7
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[4]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[4]];
// Loop over CNs
for (i=0; i<(M32-1); i++)
{
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<7; j++)
{
// BN offset is units of 5*384/32 = 60
ymm0 = p_cnProcBuf [j*60 + i];
ymm1 = p_cnProcBufRes[j*60 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
pcResSum |= pcRes;
}
// Last 32 CNs might not be full valid 32 depending on Z
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<7; j++)
{
// BN offset is units of 5*384/32 = 60
ymm0 = p_cnProcBuf [j*60 + i];
ymm1 = p_cnProcBufRes[j*60 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
// Only use valid CNs
pcResSum |= (pcRes&(0xFFFFFFFF>>(32-Mrem)));
// If PC failed we can stop here
if (pcResSum > 0)
{
return pcResSum;
}
}
// =====================================================================
// Process group with 8 BNs
if (lut_numCnInCnGroups[5] > 0)
{
// Reset results
pcResSum = 0;
// Number of CNs in group
M = lut_numCnInCnGroups[5]*Z;
// Remainder modulo 32
Mrem = M&31;
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M32 = (M + 31)>>5;
// Set pointers to start of group 8
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[5]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[5]];
// Loop over CNs
for (i=0; i<(M32-1); i++)
{
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<8; j++)
{
// BN offset is units of 2*384/32 = 24
ymm0 = p_cnProcBuf [j*24 + i];
ymm1 = p_cnProcBufRes[j*24 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
pcResSum |= pcRes;
}
// Last 32 CNs might not be full valid 32 depending on Z
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<8; j++)
{
// BN offset is units of 2*384/32 = 24
ymm0 = p_cnProcBuf [j*24 + i];
ymm1 = p_cnProcBufRes[j*24 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
// Only use valid CNs
pcResSum |= (pcRes&(0xFFFFFFFF>>(32-Mrem)));
// If PC failed we can stop here
if (pcResSum > 0)
{
return pcResSum;
}
}
// =====================================================================
// Process group with 9 BNs
if (lut_numCnInCnGroups[6] > 0)
{
// Reset results
pcResSum = 0;
// Number of CNs in group
M = lut_numCnInCnGroups[6]*Z;
// Remainder modulo 32
Mrem = M&31;
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M32 = (M + 31)>>5;
// Set pointers to start of group 9
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[6]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[6]];
// Loop over CNs
for (i=0; i<(M32-1); i++)
{
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<9; j++)
{
// BN offset is units of 2*384/32 = 24
ymm0 = p_cnProcBuf [j*24 + i];
ymm1 = p_cnProcBufRes[j*24 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
pcResSum |= pcRes;
}
// Last 32 CNs might not be full valid 32 depending on Z
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<9; j++)
{
// BN offset is units of 2*384/32 = 24
ymm0 = p_cnProcBuf [j*24 + i];
ymm1 = p_cnProcBufRes[j*24 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
// Only use valid CNs
pcResSum |= (pcRes&(0xFFFFFFFF>>(32-Mrem)));
// If PC failed we can stop here
if (pcResSum > 0)
{
return pcResSum;
}
}
// =====================================================================
// Process group with 10 BNs
if (lut_numCnInCnGroups[7] > 0)
{
// Reset results
pcResSum = 0;
// Number of CNs in group
M = lut_numCnInCnGroups[7]*Z;
// Remainder modulo 32
Mrem = M&31;
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M32 = (M + 31)>>5;
// Set pointers to start of group 10
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[7]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[7]];
// Loop over CNs
for (i=0; i<(M32-1); i++)
{
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<10; j++)
{
// BN offset is units of 1*384/32 = 12
ymm0 = p_cnProcBuf [j*12 + i];
ymm1 = p_cnProcBufRes[j*12 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
pcResSum |= pcRes;
}
// Last 32 CNs might not be full valid 32 depending on Z
pcRes = 0;
// Loop over every BN
// Compute PC for 32 CNs at once
for (j=0; j<10; j++)
{
// BN offset is units of 1*384/32 = 12
ymm0 = p_cnProcBuf [j*12 + i];
ymm1 = p_cnProcBufRes[j*12 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
// Only use valid CNs
pcResSum |= (pcRes&(0xFFFFFFFF>>(32-Mrem)));
// If PC failed we can stop here
if (pcResSum > 0)
{
return pcResSum;
}
}
// =====================================================================
// Process group with 19 BNs
if (lut_numCnInCnGroups[8] > 0)
{
// Reset results
pcResSum = 0;
// Number of CNs in group
M = lut_numCnInCnGroups[8]*Z;
// Remainder modulo 32
Mrem = M&31;
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M32 = (M + 31)>>5;
// Set pointers to start of group 19
p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[8]];
p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[8]];
// Loop over CNs
for (i=0; i<(M32-1); i++)
{
pcRes = 0;
// Loop over every BN (Last BN is connected to multiple CNs)
// Compute PC for 32 CNs at once
for (j=0; j<19; j++)
{
// BN offset is units of 4*384/32 = 48
ymm0 = p_cnProcBuf [j*48 + i];
ymm1 = p_cnProcBufRes[j*48 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
pcResSum |= pcRes;
}
// Last 32 CNs might not be full valid 32 depending on Z
pcRes = 0;
// Loop over every BN (Last BN is connected to multiple CNs)
// Compute PC for 32 CNs at once
for (j=0; j<19; j++)
{
// BN offset is units of 4*384/32 = 48
ymm0 = p_cnProcBuf [j*48 + i];
ymm1 = p_cnProcBufRes[j*48 + i];
// Add BN and input LLR, extract the sign bit
// and add in GF(2) (xor)
pcRes ^= _mm256_movemask_epi8(_mm256_adds_epi8(ymm0,ymm1));
}
// If no error pcRes should be 0
// Only use valid CNs
pcResSum |= (pcRes&(0xFFFFFFFF>>(32-Mrem)));
// If PC failed we can stop here
if (pcResSum > 0)
{
return pcResSum;
}
}
return pcResSum;
}
/**
   \brief Parity check of one CN group of the CN processing buffer.
          For each CN, the BN message and the CN result are added (saturating), the sign bit
          is extracted and the sign bits of all BNs of that CN are added in GF(2) (xor).
          32 CNs are checked at once per 256-bit word.
   \param cnProcBuf Start of the group in the CN processing buffer (accessed through __m256i pointers)
   \param cnProcBufRes Start of the group in the CN processing results buffer (accessed through __m256i pointers)
   \param M Number of CNs in the group
   \param numBn Number of BNs connected to every CN of the group
   \param bnOffset Distance between two consecutive BNs of the group in units of __m256i
   \return OR of the parity bits of all 32-CN words of the group, 0 if every valid CN passes
*/
static inline uint32_t nrLDPC_cnProcPc_BG2_group(const int8_t* cnProcBuf, const int8_t* cnProcBufRes, uint32_t M, uint32_t numBn, uint32_t bnOffset)
{
    const __m256i* p_cnProcBuf    = (const __m256i*) cnProcBuf;
    const __m256i* p_cnProcBufRes = (const __m256i*) cnProcBufRes;
    // Number of groups of 32 CNs for parallel processing
    // Ceil for values not divisible by 32
    const uint32_t M32  = (M + 31)>>5;
    // Remainder modulo 32
    const uint32_t Mrem = M&31;
    // Last 32 CNs might not be full valid 32 depending on Z.
    // Mrem == 0 means the last word is completely valid; it must be handled
    // separately because shifting a 32-bit value by 32 is undefined behaviour.
    const uint32_t lastMask = (Mrem == 0) ? 0xFFFFFFFFu : (0xFFFFFFFFu >> (32 - Mrem));
    uint32_t pcResSum = 0;
    uint32_t i;
    uint32_t j;

    // Loop over CNs, 32 at a time
    for (i=0; i<M32; i++)
    {
        uint32_t pcRes = 0;

        // Loop over every BN
        for (j=0; j<numBn; j++)
        {
            const __m256i ymm0 = p_cnProcBuf   [j*bnOffset + i];
            const __m256i ymm1 = p_cnProcBufRes[j*bnOffset + i];

            // Add BN and input LLR, extract the sign bit
            // and add in GF(2) (xor)
            pcRes ^= (uint32_t) _mm256_movemask_epi8(_mm256_adds_epi8(ymm0, ymm1));
        }

        // Only use valid CNs in the last word
        if (i == (M32-1))
        {
            pcRes &= lastMask;
        }

        // If no error pcRes should be 0
        pcResSum |= pcRes;
    }

    return pcResSum;
}

/**
   \brief Performs parity check for BG2 on the CN processing buffer. Stops as soon as error is detected.
          The CN groups are checked in the order 3, 4, 5, 6, 8 and 10 BNs per CN; groups without CNs are skipped.
   \param p_lut Pointer to decoder LUTs (numCnInCnGroups and startAddrCnGroups are read)
   \param p_procBuf Pointer to the processing buffers (cnProcBuf and cnProcBufRes are read)
   \param Z Lifting size
   \return 32-bit parity check indicator: 0 if all checks pass, otherwise the result of the first failing CN group
*/
static inline uint32_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
    const uint8_t*  lut_numCnInCnGroups   = p_lut->numCnInCnGroups;
    const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;

    const int8_t* cnProcBuf    = p_procBuf->cnProcBuf;
    const int8_t* cnProcBufRes = p_procBuf->cnProcBufRes;

    uint32_t pcResSum = 0;

    // The helper is called with literal numBn/bnOffset so that, once inlined,
    // the inner loops keep their compile-time constant bounds and strides.

    // =====================================================================
    // Process group with 3 BNs
    // BN offset is units of (6*384/32) = 72
    if (lut_numCnInCnGroups[0] > 0)
    {
        pcResSum = nrLDPC_cnProcPc_BG2_group(&cnProcBuf[lut_startAddrCnGroups[0]], &cnProcBufRes[lut_startAddrCnGroups[0]], lut_numCnInCnGroups[0]*Z, 3, 72);
        // If PC failed we can stop here
        if (pcResSum > 0)
        {
            return pcResSum;
        }
    }

    // =====================================================================
    // Process group with 4 BNs
    // BN offset is units of 20*384/32 = 240
    if (lut_numCnInCnGroups[1] > 0)
    {
        pcResSum = nrLDPC_cnProcPc_BG2_group(&cnProcBuf[lut_startAddrCnGroups[1]], &cnProcBufRes[lut_startAddrCnGroups[1]], lut_numCnInCnGroups[1]*Z, 4, 240);
        // If PC failed we can stop here
        if (pcResSum > 0)
        {
            return pcResSum;
        }
    }

    // =====================================================================
    // Process group with 5 BNs
    // BN offset is units of 9*384/32 = 108
    if (lut_numCnInCnGroups[2] > 0)
    {
        pcResSum = nrLDPC_cnProcPc_BG2_group(&cnProcBuf[lut_startAddrCnGroups[2]], &cnProcBufRes[lut_startAddrCnGroups[2]], lut_numCnInCnGroups[2]*Z, 5, 108);
        // If PC failed we can stop here
        if (pcResSum > 0)
        {
            return pcResSum;
        }
    }

    // =====================================================================
    // Process group with 6 BNs
    // BN offset is units of 3*384/32 = 36
    if (lut_numCnInCnGroups[3] > 0)
    {
        pcResSum = nrLDPC_cnProcPc_BG2_group(&cnProcBuf[lut_startAddrCnGroups[3]], &cnProcBufRes[lut_startAddrCnGroups[3]], lut_numCnInCnGroups[3]*Z, 6, 36);
        // If PC failed we can stop here
        if (pcResSum > 0)
        {
            return pcResSum;
        }
    }

    // =====================================================================
    // Process group with 8 BNs
    // BN offset is units of 2*384/32 = 24
    if (lut_numCnInCnGroups[4] > 0)
    {
        pcResSum = nrLDPC_cnProcPc_BG2_group(&cnProcBuf[lut_startAddrCnGroups[4]], &cnProcBufRes[lut_startAddrCnGroups[4]], lut_numCnInCnGroups[4]*Z, 8, 24);
        // If PC failed we can stop here
        if (pcResSum > 0)
        {
            return pcResSum;
        }
    }

    // =====================================================================
    // Process group with 10 BNs
    // BN offset is units of 2*384/32 = 24
    if (lut_numCnInCnGroups[5] > 0)
    {
        pcResSum = nrLDPC_cnProcPc_BG2_group(&cnProcBuf[lut_startAddrCnGroups[5]], &cnProcBufRes[lut_startAddrCnGroups[5]], lut_numCnInCnGroups[5]*Z, 10, 24);
        // If PC failed we can stop here
        if (pcResSum > 0)
        {
            return pcResSum;
        }
    }

    return pcResSum;
}
#endif
......@@ -33,8 +33,10 @@
#include "nrLDPC_mPass.h"
#include "nrLDPC_cnProc.h"
#include "nrLDPC_bnProc.h"
#define UNROLL_CN_PROC 1
#define UNROLL_BN_PROC 1
#define UNROLL_BN_PROC_PC 1
#define UNROLL_BN2CN_PROC 1
/*----------------------------------------------------------------------
| cn processing files -->AVX512
/----------------------------------------------------------------------*/
......@@ -197,14 +199,8 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->llr2CnProcBuf);
#endif
if (BG == 1)
{
nrLDPC_llr2CnProcBuf_BG1(p_lut, p_llr, cnProcBuf, Z);
}
else
{
nrLDPC_llr2CnProcBuf_BG2(p_lut, p_llr, cnProcBuf, Z);
}
if (BG == 1) nrLDPC_llr2CnProcBuf_BG1(p_lut, p_llr, cnProcBuf, Z);
else nrLDPC_llr2CnProcBuf_BG2(p_lut, p_llr, cnProcBuf, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->llr2CnProcBuf);
#endif
......@@ -220,8 +216,10 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->cnProc);
#endif
if (BG==1)
{
if (BG==1) {
#ifndef UNROLL_CN_PROC
nrLDPC_cnProc_BG1(p_lut, cnProcBuf, cnProcBufRes, Z);
#else
switch (R)
{
case 13:
......@@ -232,7 +230,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
nrLDPC_cnProc_BG1_R13_AVX2(cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
case 23:
{
......@@ -242,7 +240,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
nrLDPC_cnProc_BG1_R23_AVX2(cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
case 89:
{
......@@ -252,14 +250,15 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
nrLDPC_cnProc_BG1_R89_AVX2(cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
}
}
else
{
switch (R)
{
#endif
} else {
#ifndef UNROLL_CN_PROC
nrLDPC_cnProc_BG2(p_lut, cnProcBuf, cnProcBufRes, Z);
#else
switch (R) {
case 15:
{
#ifdef __AVX512BW__
......@@ -268,8 +267,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
nrLDPC_cnProc_BG2_R15_AVX2(cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
case 13:
{
#ifdef __AVX512BW__
......@@ -278,8 +276,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
nrLDPC_cnProc_BG2_R13_AVX2(cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
case 23:
{
#ifdef __AVX512BW__
......@@ -288,10 +285,11 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
nrLDPC_cnProc_BG2_R23_AVX2(cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
}
}
#endif
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->cnProc);
#endif
......@@ -304,14 +302,8 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->cn2bnProcBuf);
#endif
if (BG == 1)
{
nrLDPC_cn2bnProcBuf_BG1(p_lut, cnProcBufRes, bnProcBuf, Z);
}
else
{
nrLDPC_cn2bnProcBuf_BG2(p_lut, cnProcBufRes, bnProcBuf, Z);
}
if (BG == 1) nrLDPC_cn2bnProcBuf_BG1(p_lut, cnProcBufRes, bnProcBuf, Z);
else nrLDPC_cn2bnProcBuf_BG2(p_lut, cnProcBufRes, bnProcBuf, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->cn2bnProcBuf);
#endif
......@@ -326,12 +318,12 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
start_meas(&p_profiler->bnProcPc);
#endif
//nrLDPC_bnProcPc(p_lut, p_procBuf, Z);
if (BG==1)
{
switch (R)
{
#ifndef UNROLL_BN_PROC_PC
nrLDPC_bnProcPc(p_lut, bnProcBuf, bnProcBufRes, llrProcBuf, llrRes, Z);
#else
if (BG==1) {
switch (R) {
case 13:
{
nrLDPC_bnProcPc_BG1_R13_AVX2(bnProcBuf,bnProcBufRes,llrRes, llrProcBuf, Z);
......@@ -348,34 +340,27 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
break;
}
}
}
else
{
switch (R)
{
} else {
switch (R) {
case 15:
{
nrLDPC_bnProcPc_BG2_R15_AVX2(bnProcBuf,bnProcBufRes, llrRes, llrProcBuf, Z);
break;
}
case 13:
{
nrLDPC_bnProcPc_BG2_R13_AVX2(bnProcBuf,bnProcBufRes,llrRes,llrProcBuf, Z);
break;
}
case 23:
{
nrLDPC_bnProcPc_BG2_R23_AVX2(bnProcBuf,bnProcBufRes,llrRes, llrProcBuf, Z);
break;
}
}
}
}
#endif
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->bnProcPc);
......@@ -389,13 +374,12 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->bnProc);
#endif
// nrLDPC_bnProc(p_lut, p_procBuf, Z);
if (BG==1)
{
switch (R)
{
if (BG==1) {
#ifndef UNROLL_BN_PROC
nrLDPC_bnProc(p_lut, bnProcBuf, bnProcBufRes, llrRes, Z);
#else
switch (R) {
case 13:
{
#ifdef __AVX512BW__
......@@ -424,11 +408,12 @@ if (BG==1)
break;
}
}
}
else
{
switch (R)
{
#endif
} else {
#ifndef UNROLL_BN2CN_PROC
nrLDPC_bn2cnProcBuf_BG2(p_lut, bnProcBufRes, cnProcBuf, Z);
#else
switch (R) {
case 15:
{
#ifdef __AVX512BW__
......@@ -439,7 +424,6 @@ if (BG==1)
break;
}
case 13:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R13_AVX512(bnProcBuf, bnProcBufRes,llrRes, Z);
......@@ -450,7 +434,6 @@ if (BG==1)
}
case 23:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R23_AVX512(bnProcBuf, bnProcBufRes,llrRes, Z);
......@@ -459,9 +442,8 @@ if (BG==1)
#endif
break;
}
}
#endif
}
#ifdef NR_LDPC_PROFILER_DETAIL
......@@ -477,14 +459,8 @@ if (BG==1)
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->bn2cnProcBuf);
#endif
if (BG == 1)
{
nrLDPC_bn2cnProcBuf_BG1(p_lut, bnProcBufRes, cnProcBuf, Z);
}
else
{
nrLDPC_bn2cnProcBuf_BG2(p_lut, bnProcBufRes, cnProcBuf, Z);
}
if (BG == 1) nrLDPC_bn2cnProcBuf_BG1(p_lut, bnProcBufRes, cnProcBuf, Z);
else nrLDPC_bn2cnProcBuf_BG2(p_lut, bnProcBufRes, cnProcBuf, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->bn2cnProcBuf);
#endif
......@@ -499,8 +475,7 @@ if (BG==1)
// First iteration finished
while ( (i < numMaxIter) && (pcRes != 0) )
{
while ( (i < numMaxIter) && (pcRes != 0) ) {
// Increase iteration counter
i++;
......@@ -508,10 +483,11 @@ if (BG==1)
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->cnProc);
#endif
if (BG==1)
{
switch (R)
{
if (BG==1) {
#ifndef UNROLL_CN_PROC
nrLDPC_cnProc_BG1(p_lut, cnProcBuf, cnProcBufRes, Z);
#else
switch (R) {
case 13:
{
#ifdef __AVX512BW__
......@@ -520,8 +496,7 @@ if (BG==1)
nrLDPC_cnProc_BG1_R13_AVX2(cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
case 23:
{
#ifdef __AVX512BW__
......@@ -530,8 +505,7 @@ if (BG==1)
nrLDPC_cnProc_BG1_R23_AVX2(cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
case 89:
{
#ifdef __AVX512BW__
......@@ -540,14 +514,14 @@ if (BG==1)
nrLDPC_cnProc_BG1_R89_AVX2(cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
}
}
else
{
switch (R)
{
#endif
} else {
#ifndef UNROLL_CN_PROC
nrLDPC_cnProc_BG2(p_lut, cnProcBuf, cnProcBufRes, Z);
#else
switch (R) {
case 15:
{
#ifdef __AVX512BW__
......@@ -556,8 +530,7 @@ if (BG==1)
nrLDPC_cnProc_BG2_R15_AVX2(cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
case 13:
{
#ifdef __AVX512BW__
......@@ -566,8 +539,7 @@ if (BG==1)
nrLDPC_cnProc_BG2_R13_AVX2(cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
case 23:
{
#ifdef __AVX512BW__
......@@ -576,9 +548,10 @@ if (BG==1)
nrLDPC_cnProc_BG2_R23_AVX2(cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
}
#endif
}
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->cnProc);
#endif
......@@ -591,14 +564,8 @@ if (BG==1)
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->cn2bnProcBuf);
#endif
if (BG == 1)
{
nrLDPC_cn2bnProcBuf_BG1(p_lut, cnProcBufRes, bnProcBuf, Z);
}
else
{
nrLDPC_cn2bnProcBuf_BG2(p_lut, cnProcBufRes, bnProcBuf, Z);
}
if (BG == 1) nrLDPC_cn2bnProcBuf_BG1(p_lut, cnProcBufRes, bnProcBuf, Z);
else nrLDPC_cn2bnProcBuf_BG2(p_lut, cnProcBufRes, bnProcBuf, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->cn2bnProcBuf);
#endif
......@@ -612,11 +579,11 @@ if (BG==1)
start_meas(&p_profiler->bnProcPc);
#endif
//nrLDPC_bnProcPc(p_lut, p_procBuf, Z);
if (BG==1)
{
switch (R)
{
#ifndef UNROLL_BN_PROC_PC
nrLDPC_bnProcPc(p_lut, bnProcBuf, bnProcBufRes, llrProcBuf, llrRes, Z);
#else
if (BG==1) {
switch (R) {
case 13:
{
nrLDPC_bnProcPc_BG1_R13_AVX2(bnProcBuf,bnProcBufRes,llrRes, llrProcBuf, Z);
......@@ -632,36 +599,28 @@ if (BG==1)
nrLDPC_bnProcPc_BG1_R89_AVX2(bnProcBuf,bnProcBufRes, llrRes, llrProcBuf, Z);
break;
}
}
}
else
{
switch (R)
{
}
} else {
switch (R)
{
case 15:
{
nrLDPC_bnProcPc_BG2_R15_AVX2(bnProcBuf,bnProcBufRes,llrRes, llrProcBuf, Z);
break;
}
case 13:
case 13:
{
nrLDPC_bnProcPc_BG2_R13_AVX2(bnProcBuf,bnProcBufRes,llrRes, llrProcBuf, Z);
break;
}
case 23:
{
nrLDPC_bnProcPc_BG2_R23_AVX2(bnProcBuf,bnProcBufRes,llrRes, llrProcBuf, Z);
break;
}
}
}
}
#endif
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->bnProcPc);
#endif
......@@ -673,13 +632,11 @@ if (BG==1)
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->bnProc);
#endif
// nrLDPC_bnProc(p_lut, p_procBuf, Z);
if (BG==1)
{
switch (R)
{
#ifndef UNROLL_BN_PROC
nrLDPC_bnProc(p_lut, bnProcBuf, bnProcBufRes, llrRes, Z);
#else
if (BG==1) {
switch (R) {
case 13:
{
#ifdef __AVX512BW__
......@@ -707,12 +664,10 @@ if (BG==1)
#endif
break;
}
}
}
else
{
switch (R)
{
}
} else {
switch (R)
{
case 15:
{
#ifdef __AVX512BW__
......@@ -723,7 +678,6 @@ if (BG==1)
break;
}
case 13:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R13_AVX512(bnProcBuf, bnProcBufRes,llrRes, Z);
......@@ -732,9 +686,7 @@ if (BG==1)
#endif
break;
}
case 23:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R23_AVX512(bnProcBuf, bnProcBufRes,llrRes, Z);
......@@ -743,10 +695,9 @@ if (BG==1)
#endif
break;
}
}
}
}
#endif
......@@ -762,14 +713,8 @@ if (BG==1)
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->bn2cnProcBuf);
#endif
if (BG == 1)
{
nrLDPC_bn2cnProcBuf_BG1(p_lut, bnProcBufRes, cnProcBuf, Z);
}
else
{
nrLDPC_bn2cnProcBuf_BG2(p_lut, bnProcBufRes, cnProcBuf, Z);
}
if (BG == 1) nrLDPC_bn2cnProcBuf_BG1(p_lut, bnProcBufRes, cnProcBuf, Z);
else nrLDPC_bn2cnProcBuf_BG2(p_lut, bnProcBufRes, cnProcBuf, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->bn2cnProcBuf);
#endif
......@@ -778,360 +723,37 @@ if (BG==1)
nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_CN_PROC, cnProcBuf);
#endif
// Parity Check
// Parity Check
#ifdef NR_LDPC_ENABLE_PARITY_CHECK
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->cnProcPc);
start_meas(&p_profiler->cnProcPc);
#endif
if (BG == 1)
{
pcRes = nrLDPC_cnProcPc_BG1(p_lut, cnProcBuf, cnProcBufRes, Z);
}
else
{
pcRes = nrLDPC_cnProcPc_BG2(p_lut, cnProcBuf,cnProcBufRes, Z);
}
if (BG == 1) pcRes = nrLDPC_cnProcPc_BG1(p_lut, cnProcBuf, cnProcBufRes, Z);
else pcRes = nrLDPC_cnProcPc_BG2(p_lut, cnProcBuf, cnProcBufRes, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->cnProcPc);
#endif
stop_meas(&p_profiler->cnProcPc);
#endif
}
// Last iteration
if ( (i < numMaxIter) && (pcRes != 0) )
{
// Increase iteration counter
i++;
// CN processing
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->cnProc);
#endif
if (BG==1)
{
switch (R)
{
case 13:
{
#ifdef __AVX512BW__
nrLDPC_cnProc_BG1_R13_AVX512( cnProcBuf, cnProcBufRes, Z);
#else
nrLDPC_cnProc_BG1_R13_AVX2( cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
case 23:
{
#ifdef __AVX512BW__
nrLDPC_cnProc_BG1_R23_AVX512( cnProcBuf, cnProcBufRes, Z);
#else
nrLDPC_cnProc_BG1_R23_AVX2( cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
case 89:
{
#ifdef __AVX512BW__
nrLDPC_cnProc_BG1_R89_AVX512( cnProcBuf, cnProcBufRes, Z);
#else
nrLDPC_cnProc_BG1_R89_AVX2( cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
}
else
{
switch (R)
{
case 15:
{
#ifdef __AVX512BW__
nrLDPC_cnProc_BG2_R15_AVX512( cnProcBuf, cnProcBufRes, Z);
#else
nrLDPC_cnProc_BG2_R15_AVX2( cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
case 13:
{
#ifdef __AVX512BW__
nrLDPC_cnProc_BG2_R13_AVX512( cnProcBuf, cnProcBufRes, Z);
#else
nrLDPC_cnProc_BG2_R13_AVX2( cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
case 23:
{
#ifdef __AVX512BW__
nrLDPC_cnProc_BG2_R23_AVX512( cnProcBuf, cnProcBufRes, Z);
#else
nrLDPC_cnProc_BG2_R23_AVX2( cnProcBuf, cnProcBufRes, Z);
#endif
break;
}
}
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->cnProc);
#endif
#ifdef NR_LDPC_DEBUG_MODE
nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_CN_PROC_RES, p_procBuf);
#endif
// Send CN results back to BNs
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->cn2bnProcBuf);
#endif
if (BG == 1)
{
nrLDPC_cn2bnProcBuf_BG1(p_lut, cnProcBufRes, bnProcBuf, Z);
}
else
{
nrLDPC_cn2bnProcBuf_BG2(p_lut, cnProcBufRes, bnProcBuf, Z);
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->cn2bnProcBuf);
#endif
#ifdef NR_LDPC_DEBUG_MODE
nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_BN_PROC, p_procBuf);
#endif
// BN Processing
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->bnProcPc);
#endif
// nrLDPC_bnProcPc(p_lut, p_procBuf, Z);
if (BG==1)
{
switch (R)
{
case 13:
{
nrLDPC_bnProcPc_BG1_R13_AVX2( bnProcBuf,bnProcBufRes,llrRes, llrProcBuf, Z);
break;
}
case 23:
{
nrLDPC_bnProcPc_BG1_R23_AVX2( bnProcBuf,bnProcBufRes, llrRes, llrProcBuf, Z);
break;
}
case 89:
{
nrLDPC_bnProcPc_BG1_R89_AVX2( bnProcBuf,bnProcBufRes, llrRes, llrProcBuf, Z);
break;
}
}
}
else
{
switch (R)
{
case 15:
{
nrLDPC_bnProcPc_BG2_R15_AVX2( bnProcBuf,bnProcBufRes, llrRes, llrProcBuf, Z);
break;
}
case 13:
{
nrLDPC_bnProcPc_BG2_R13_AVX2( bnProcBuf,bnProcBufRes,llrRes, llrProcBuf, Z);
break;
}
case 23:
{
nrLDPC_bnProcPc_BG2_R23_AVX2( bnProcBuf,bnProcBufRes,llrRes, llrProcBuf, Z);
break;
}
}
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->bnProcPc);
#endif
#ifdef NR_LDPC_DEBUG_MODE
nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_LLR_RES, p_procBuf);
#endif
// If parity check not enabled, no need to send the BN proc results
// back to CNs
#ifdef NR_LDPC_ENABLE_PARITY_CHECK
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->bnProc);
#endif
//nrLDPC_bnProc(p_lut, p_procBuf, Z);
if (BG==1)
{
switch (R)
{
case 13:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R13_AVX512( bnProcBuf, bnProcBufRes,llrRes, Z);
#else
nrLDPC_bnProc_BG1_R13_AVX2( bnProcBuf, bnProcBufRes,llrRes, Z);
#endif
break;
}
case 23:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R23_AVX512( bnProcBuf, bnProcBufRes,llrRes, Z);
#else
nrLDPC_bnProc_BG1_R23_AVX2( bnProcBuf, bnProcBufRes,llrRes, Z);
#endif
break;
}
case 89:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R89_AVX512( bnProcBuf, bnProcBufRes,llrRes, Z);
#else
nrLDPC_bnProc_BG1_R89_AVX2( bnProcBuf, bnProcBufRes,llrRes, Z);
#endif
break;
}
}
}
else
{
switch (R)
{
case 15:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R15_AVX512( bnProcBuf, bnProcBufRes,llrRes, Z);
#else
nrLDPC_bnProc_BG2_R15_AVX2( bnProcBuf, bnProcBufRes,llrRes, Z);
#endif
break;
}
case 13:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R13_AVX512( bnProcBuf, bnProcBufRes,llrRes, Z);
#else
nrLDPC_bnProc_BG2_R13_AVX2( bnProcBuf, bnProcBufRes,llrRes, Z);
#endif
break;
}
case 23:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R23_AVX512( bnProcBuf, bnProcBufRes,llrRes, Z);
#else
nrLDPC_bnProc_BG2_R23_AVX2( bnProcBuf, bnProcBufRes,llrRes, Z);
#endif
break;
}
}
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->bnProc);
#endif
#ifdef NR_LDPC_DEBUG_MODE
nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_BN_PROC_RES, p_procBuf);
#endif
// BN results to CN processing buffer
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->bn2cnProcBuf);
#endif
if (BG == 1)
{
nrLDPC_bn2cnProcBuf_BG1(p_lut, bnProcBufRes, cnProcBuf, Z);
}
else
{
nrLDPC_bn2cnProcBuf_BG2(p_lut, bnProcBufRes, cnProcBuf, Z);
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->bn2cnProcBuf);
#endif
#ifdef NR_LDPC_DEBUG_MODE
nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_CN_PROC, p_procBuf);
#endif
// Parity Check
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->cnProcPc);
#endif
if (BG == 1)
{
pcRes = nrLDPC_cnProcPc_BG1(p_lut, cnProcBuf, cnProcBufRes, Z);
}
else
{
pcRes = nrLDPC_cnProcPc_BG2(p_lut, cnProcBuf, cnProcBufRes, Z);
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->cnProcPc);
#endif
#endif
}
// If maximum number of iterations is reached and PC still fails, increase number of iterations
// Thus, i > numMaxIter indicates that PC has failed
#ifdef NR_LDPC_ENABLE_PARITY_CHECK
if (pcRes != 0)
{
i++;
}
#endif
} // end while
// Last iteration
if (pcRes != 0) i++;
// Assign results from processing buffer to output
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->llrRes2llrOut);
start_meas(&p_profiler->llrRes2llrOut);
#endif
nrLDPC_llrRes2llrOut(p_lut, p_llrOut, llrRes, Z, BG);
nrLDPC_llrRes2llrOut(p_lut, p_llrOut, llrRes, Z, BG);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->llrRes2llrOut);
stop_meas(&p_profiler->llrRes2llrOut);
#endif
// Hard-decision
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas(&p_profiler->llr2bit);
start_meas(&p_profiler->llr2bit);
#endif
if (outMode == nrLDPC_outMode_BIT)
{
nrLDPC_llr2bitPacked(p_out, p_llrOut, numLLR);
}
else if (outMode == nrLDPC_outMode_BITINT8)
{
nrLDPC_llr2bit(p_out, p_llrOut, numLLR);
}
if (outMode == nrLDPC_outMode_BIT) nrLDPC_llr2bitPacked(p_out, p_llrOut, numLLR);
else //if (outMode == nrLDPC_outMode_BITINT8)
nrLDPC_llr2bit(p_out, p_llrOut, numLLR);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->llr2bit);
#endif
......
......@@ -41,15 +41,6 @@
\param Z Lifting size
\param cshift Circular shift
*/
// Faster memcpy using "rep movsb", which on modern processors is highly optimized.
// Copies n bytes from src to dst and returns the original dst pointer.
// "rep movsb" advances the destination and source index registers and counts the
// count register down, so all three operands are declared read-write ("+");
// declaring n and src as input-only would let the compiler assume their registers
// still hold the original values after the instruction.
// NOTE(review): this is a non-static definition that appears to live in a header;
// confirm it is not included from more than one translation unit.
void *memcpy1(void *dst, const void *src, size_t n)
{
    void *ret = dst;
    // __asm__/__volatile__ spellings also compile in strict ISO modes (-std=c99/c11)
    __asm__ __volatile__("rep movsb" : "+D" (dst), "+c" (n), "+S" (src) : : "cc", "memory");
    return ret;
}
static inline void *nrLDPC_inv_circ_memcpy(int8_t *str1, const int8_t *str2, uint16_t Z, uint16_t cshift)
{
......
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The OpenAirInterface Software Alliance licenses this file to You under
* the OAI Public License, Version 1.1 (the "License"); you may not use this file
* except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.openairinterface.org/?page_id=698
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*/
/*!\file nrLDPC_mPass.h
* \brief Defines the functions for message passing
* \author Sebastian Wagner (TCL Communications) Email: <mailto:sebastian.wagner@tcl.com>
* \date 30-09-2019
* \version 2.0
* \note
* \warning
*/
#ifndef __NR_LDPC_MPASS__H__
#define __NR_LDPC_MPASS__H__
#include <string.h>
#include "nrLDPCdecoder_defs.h"
//#include <omp.h>
/**
   \brief Inverse circular memcpy: copies Z bytes from str2 to str1 such that the byte at
          source index k ends up at destination index (k + cshift) mod Z.

                |<------ rem ------>|<-- cshift -->|
   (src) str2 = |AAAAAAAAAAAAAAAAAAA|BBBBBBBBBBBBBB|

                |<-- cshift -->|<------ rem ------>|
   (dst) str1 = |BBBBBBBBBBBBBB|AAAAAAAAAAAAAAAAAAA|

   \param str1 Pointer to the start of the destination buffer
   \param str2 Pointer to the source buffer
   \param Z Lifting size (total number of bytes copied)
   \param cshift Circular shift; rem = Z - cshift is computed in uint16_t, so cshift is expected not to exceed Z
   \return str1
*/
static inline void *nrLDPC_inv_circ_memcpy(int8_t *str1, const int8_t *str2, uint16_t Z, uint16_t cshift)
{
    // Length of the leading source segment
    const uint16_t headLen = Z - cshift;

    // Leading source segment lands behind the first cshift destination bytes
    memcpy(&str1[cshift], &str2[0], headLen);
    // Trailing source segment wraps around to the start of the destination
    memcpy(&str1[0], &str2[headLen], cshift);

    return str1;
}
/**
   \brief Circular memcpy: copies Z bytes from str2 to str1 such that the byte at
          destination index k is taken from source index (k + cshift) mod Z.

                |<-- cshift -->|<------ rem ------>|
   (src) str2 = |AAAAAAAAAAAAAA|BBBBBBBBBBBBBBBBBBB|

                |<------ rem ------>|<-- cshift -->|
   (dst) str1 = |BBBBBBBBBBBBBBBBBBB|AAAAAAAAAAAAAA|

   \param str1 Pointer to the start of the destination buffer
   \param str2 Pointer to the source buffer
   \param Z Lifting size (total number of bytes copied)
   \param cshift Circular shift; rem = Z - cshift is computed in uint16_t, so cshift is expected not to exceed Z
   \return str1
*/
static inline void *nrLDPC_circ_memcpy(int8_t *str1, const int8_t *str2, uint16_t Z, uint16_t cshift)
{
    // Length of the trailing source segment
    const uint16_t tailLen = Z - cshift;

    // Trailing source segment moves to the start of the destination
    memcpy(&str1[0], &str2[cshift], tailLen);
    // Leading source segment wraps around behind it
    memcpy(&str1[tailLen], &str2[0], cshift);

    return str1;
}
/**
\brief Copies the input LLRs to their corresponding place in the LLR processing buffer.
Example: BG2
| 0| 0| LLRs --> |
BN Groups |22|23|10| 5| 5|14| 7|13| 6| 8| 9|16| 9|12|1|1|...|1|
^---------------------------------------/---- /
_________________________/ | /
/ ____________________________|___/
/ / \
LLR Proc Buffer (BNG) | 1| 5| 6| 7| 8| 9|10|12|13|14|16|22|23|
Number BN in BNG(R15) |38| 2| 1| 1| 1| 2| 1| 1| 1| 1| 1| 1| 1|
Idx: 0 ^ ^ ^
38*384=14592 _____| ... | |
50*384=19200 ----------------------------------- |
51*384=19584 --------------------------------------
\param p_lut Pointer to decoder LUTs
\param llr Pointer to input LLRs
\param p_procBuf Pointer to the processing buffers
\param Z Lifting size
\param BG Base graph
*/
static inline void nrLDPC_llr2llrProcBuf(t_nrLDPC_lut* p_lut, int8_t* llr, t_nrLDPC_procBuf* p_procBuf, uint16_t Z, uint8_t BG)
{
uint32_t i;
const uint8_t numBn2CnG1 = p_lut->numBnInBnGroups[0];
uint32_t startColParity = (BG ==1 ) ? (NR_LDPC_START_COL_PARITY_BG1) : (NR_LDPC_START_COL_PARITY_BG2);
uint32_t colG1 = startColParity*Z;
const uint16_t* lut_llr2llrProcBufAddr = p_lut->llr2llrProcBufAddr;
const uint8_t* lut_llr2llrProcBufBnPos = p_lut->llr2llrProcBufBnPos;
uint32_t idxBn;
int8_t* llrProcBuf = p_procBuf->llrProcBuf;
// Copy LLRs connected to 1 CN
if (numBn2CnG1 > 0)
{
memcpy(&llrProcBuf[0], &llr[colG1], numBn2CnG1*Z);
}
// First 2 columns might be set to zero directly if it's true they always belong to the groups with highest number of connected CNs...
for (i=0; i<startColParity; i++)
{
idxBn = lut_llr2llrProcBufAddr[i] + lut_llr2llrProcBufBnPos[i]*Z;
memcpy(&llrProcBuf[idxBn], llr, Z);
llr += Z;
}
}
/**
   \brief Copies the input LLRs to their corresponding place in the CN processing buffer for BG1.

   BG1 has nine CN groups, whose CNs are connected to 3, 4, 5, 6, 7, 8, 9, 10 and 19 BNs.
   For CN group g and BN row j the destination starts at
       startAddrCnGroups[g] + j * lut_numCnInCnGroups_BG1_R13[g] * NR_LDPC_ZMAX
   and receives one segment of Z LLRs per CN of the group. Segment i is the input column
   posBnInCnProcBuf[g][j][i], circularly shifted by circShift[g][j][i]
   (see nrLDPC_circ_memcpy).

   For the first group (3 BNs) only the segment of the first CN is copied in every row.

   \param p_lut Pointer to decoder LUTs
   \param llr Pointer to input LLRs
   \param p_procBuf Pointer to the processing buffers
   \param Z Lifting size
*/
static inline void nrLDPC_llr2CnProcBuf_BG1(t_nrLDPC_lut* p_lut, int8_t* llr, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
    // Number of BNs connected to a CN, for each of the nine CN groups of BG1
    static const uint8_t bnPerCn[9] = {3, 4, 5, 6, 7, 8, 9, 10, 19};

    const uint8_t*  numCn     = p_lut->numCnInCnGroups;
    const uint32_t* groupAddr = p_lut->startAddrCnGroups;
    int8_t* const   cnBuf     = p_procBuf->cnProcBuf;

    // =====================================================================
    // CN group with 3 BNs: only the first CN of every row is transferred
    {
        const uint16_t (*shift)[lut_numCnInCnGroups_BG1_R13[0]] = (uint16_t(*)[lut_numCnInCnGroups_BG1_R13[0]]) p_lut->circShift[0];
        const uint8_t  (*bnPos)[lut_numCnInCnGroups_BG1_R13[0]] = (uint8_t(*)[lut_numCnInCnGroups_BG1_R13[0]]) p_lut->posBnInCnProcBuf[0];
        const uint32_t rowPitch = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX;

        for (uint32_t row = 0; row < bnPerCn[0]; row++)
        {
            const uint32_t llrIdx = bnPos[row][0]*Z;

            nrLDPC_circ_memcpy(&cnBuf[groupAddr[0] + row*rowPitch], &llr[llrIdx], Z, shift[row][0]);
        }
    }

    // =====================================================================
    // CN groups with 4, 5, 6, 7, 8, 9, 10 and 19 BNs
    for (uint32_t g = 1; g < 9; g++)
    {
        // 2-D views of the LUTs: one row per BN, row width taken from lut_numCnInCnGroups_BG1_R13
        const uint16_t (*shift)[lut_numCnInCnGroups_BG1_R13[g]] = (uint16_t(*)[lut_numCnInCnGroups_BG1_R13[g]]) p_lut->circShift[g];
        const uint8_t  (*bnPos)[lut_numCnInCnGroups_BG1_R13[g]] = (uint8_t(*)[lut_numCnInCnGroups_BG1_R13[g]]) p_lut->posBnInCnProcBuf[g];
        // Distance between two consecutive BN rows of this group in the CN processing buffer
        const uint32_t rowPitch = lut_numCnInCnGroups_BG1_R13[g]*NR_LDPC_ZMAX;

        for (uint32_t row = 0; row < bnPerCn[g]; row++)
        {
            int8_t* dst = &cnBuf[groupAddr[g] + row*rowPitch];

            // One Z-sized segment per CN of the group, stored back to back
            for (uint32_t cn = 0; cn < numCn[g]; cn++)
            {
                const uint32_t llrIdx = bnPos[row][cn]*Z;

                nrLDPC_circ_memcpy(dst, &llr[llrIdx], Z, shift[row][cn]);
                dst += Z;
            }
        }
    }
}
/**
   \brief Copies the input LLRs to their corresponding place in the CN processing buffer for BG2.

   BG2 has six CN groups, whose CNs are connected to 3, 4, 5, 6, 8 and 10 BNs.
   For CN group g and BN row j the destination starts at
       startAddrCnGroups[g] + j * lut_numCnInCnGroups_BG2_R15[g] * NR_LDPC_ZMAX
   and receives one segment of Z LLRs per CN of the group. Segment i is the input column
   posBnInCnProcBuf[g][j][i], circularly shifted by circShift[g][j][i]
   (see nrLDPC_circ_memcpy).

   Example: BG2, Z = 384

   Input LLR columns with the BN group each column belongs to:

                 | 0| 0|  LLRs -->
      BN Groups  |22|23|10| 5| 5|14| 7|13| 6| 8| 9|16| 9|12| 1| 1|...| 1|

   CN processing buffer:

      CN Processing Buffer (CNGs) | 3| 4| 5| 6| 8|10|
      Number of CN per CNG (R15)  | 6|20| 9| 3| 2| 2|

      Start of CNG 3: 0
      Start of CNG 4: 3*6*384 = 6912
      Start of CNG 6: (3*6+4*20+5*9)*384 = 54912

   Inside CNG 6 the buffer holds six parts (Bit 1..6) of 3*Z values each, one part per
   connected BN, each covering the 3 CNs of the group:

      Bit | 1| 2| 3| 4| 5| 6|

      Start of Bit 2: 54912 + 3*384

   \param p_lut Pointer to decoder LUTs
   \param llr Pointer to input LLRs
   \param p_procBuf Pointer to the processing buffers
   \param Z Lifting size
*/
static inline void nrLDPC_llr2CnProcBuf_BG2(t_nrLDPC_lut* p_lut, int8_t* llr, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
    // Number of BNs connected to a CN, for each of the six CN groups of BG2
    static const uint8_t bnPerCn[6] = {3, 4, 5, 6, 8, 10};

    const uint8_t*  numCn     = p_lut->numCnInCnGroups;
    const uint32_t* groupAddr = p_lut->startAddrCnGroups;
    int8_t* const   cnBuf     = p_procBuf->cnProcBuf;

    for (uint32_t g = 0; g < 6; g++)
    {
        // 2-D views of the LUTs: one row per BN, row width taken from lut_numCnInCnGroups_BG2_R15
        const uint16_t (*shift)[lut_numCnInCnGroups_BG2_R15[g]] = (uint16_t(*)[lut_numCnInCnGroups_BG2_R15[g]]) p_lut->circShift[g];
        const uint8_t  (*bnPos)[lut_numCnInCnGroups_BG2_R15[g]] = (uint8_t(*)[lut_numCnInCnGroups_BG2_R15[g]]) p_lut->posBnInCnProcBuf[g];
        // Distance between two consecutive BN rows of this group in the CN processing buffer
        const uint32_t rowPitch = lut_numCnInCnGroups_BG2_R15[g]*NR_LDPC_ZMAX;

        for (uint32_t row = 0; row < bnPerCn[g]; row++)
        {
            int8_t* dst = &cnBuf[groupAddr[g] + row*rowPitch];

            // One Z-sized segment per CN of the group, stored back to back
            for (uint32_t cn = 0; cn < numCn[g]; cn++)
            {
                const uint32_t llrIdx = bnPos[row][cn]*Z;

                nrLDPC_circ_memcpy(dst, &llr[llrIdx], Z, shift[row][cn]);
                dst += Z;
            }
        }
    }
}
/**
   \brief Copies the values in the CN processing results buffer to their corresponding place in the BN processing buffer for BG2.

   BG2 has six CN groups, whose CNs are connected to 3, 4, 5, 6, 8 and 10 BNs.
   For CN group g and BN row j the source starts at
       startAddrCnGroups[g] + j * lut_numCnInCnGroups_BG2_R15[g] * NR_LDPC_ZMAX
   in the CN processing results buffer and holds one segment of Z values per CN of the group.
   Segment i is written to offset
       startAddrBnProcBuf[g][j][i] + bnPosBnProcBuf[g][j][i] * Z
   of the BN processing buffer with the circular shift circShift[g][j][i] applied in the
   inverse direction (see nrLDPC_inv_circ_memcpy).

   LUT layout as used here: circShift has lut_numCnInCnGroups_BG2_R15[g] entries per BN row,
   while startAddrBnProcBuf and bnPosBnProcBuf have p_lut->numCnInCnGroups[g] entries per
   BN row.

   \param p_lut Pointer to decoder LUTs
   \param p_procBuf Pointer to the processing buffers
   \param Z Lifting size
*/
static inline void nrLDPC_cn2bnProcBuf_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
    // Number of BNs connected to a CN, for each of the six CN groups of BG2
    static const uint8_t bnPerCn[6] = {3, 4, 5, 6, 8, 10};

    const uint8_t*  numCn     = p_lut->numCnInCnGroups;
    const uint32_t* groupAddr = p_lut->startAddrCnGroups;
    int8_t*         cnRes     = p_procBuf->cnProcBufRes;
    int8_t*         bnBuf     = p_procBuf->bnProcBuf;

    for (uint32_t g = 0; g < 6; g++)
    {
        // The LUTs are addressed as flat arrays (row*width + column) rather than through
        // pointer-to-VLA views: the width of the BN address/position LUTs is the run-time
        // CN count of the group, and declaring a variably modified type whose bound
        // evaluates to 0 is undefined behaviour in C.
        // NOTE(review): a count of 0 would mean a CN group that is empty for the
        // configured LUT set - confirm against the LUT definitions.
        const uint16_t* shift  = (const uint16_t*) p_lut->circShift[g];
        const uint32_t* bnAddr = (const uint32_t*) p_lut->startAddrBnProcBuf[g];
        const uint8_t*  bnPos  = (const uint8_t*)  p_lut->bnPosBnProcBuf[g];

        const uint32_t shiftCols = lut_numCnInCnGroups_BG2_R15[g]; // row width of circShift
        const uint32_t bnLutCols = numCn[g];                       // row width of bnAddr/bnPos
        // Distance between two consecutive BN rows of this group in the CN results buffer
        const uint32_t rowPitch  = lut_numCnInCnGroups_BG2_R15[g]*NR_LDPC_ZMAX;

        for (uint32_t row = 0; row < bnPerCn[g]; row++)
        {
            const int8_t* src = &cnRes[groupAddr[g] + row*rowPitch];

            // One Z-sized segment per CN of the group, stored back to back
            for (uint32_t cn = 0; cn < bnLutCols; cn++)
            {
                const uint32_t bnIdx = bnAddr[row*bnLutCols + cn] + bnPos[row*bnLutCols + cn]*Z;

                nrLDPC_inv_circ_memcpy(&bnBuf[bnIdx], src, Z, shift[row*shiftCols + cn]);
                src += Z;
            }
        }
    }
}
/**
   \brief Copies the values in the CN processing results buffer to their corresponding place in the BN processing buffer for BG1.

   BG1 has nine CN groups, whose CNs are connected to 3, 4, 5, 6, 7, 8, 9, 10 and 19 BNs.
   For CN group g and BN row j the source starts at
       startAddrCnGroups[g] + j * lut_numCnInCnGroups_BG1_R13[g] * NR_LDPC_ZMAX
   in the CN processing results buffer and holds one segment of Z values per CN of the group.
   Segment i is written to offset
       startAddrBnProcBuf[g][j][i] + bnPosBnProcBuf[g][j][i] * Z
   of the BN processing buffer with the circular shift circShift[g][j][i] applied in the
   inverse direction (see nrLDPC_inv_circ_memcpy).

   For the first group (3 BNs) only the segment of the first CN is copied in every row, and
   it is written directly to startAddrBnProcBuf[0][j][0] without a bnPosBnProcBuf offset.

   LUT layout as used here: circShift has lut_numCnInCnGroups_BG1_R13[g] entries per BN row,
   while startAddrBnProcBuf and bnPosBnProcBuf have p_lut->numCnInCnGroups[g] entries per
   BN row.

   \param p_lut Pointer to decoder LUTs
   \param p_procBuf Pointer to the processing buffers
   \param Z Lifting size
*/
static inline void nrLDPC_cn2bnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
    // Number of BNs connected to a CN, for each of the nine CN groups of BG1
    static const uint8_t bnPerCn[9] = {3, 4, 5, 6, 7, 8, 9, 10, 19};

    const uint8_t*  numCn     = p_lut->numCnInCnGroups;
    const uint32_t* groupAddr = p_lut->startAddrCnGroups;
    int8_t*         cnRes     = p_procBuf->cnProcBufRes;
    int8_t*         bnBuf     = p_procBuf->bnProcBuf;

    // The LUTs are addressed as flat arrays (row*width + column) rather than through
    // pointer-to-VLA views: the width of the BN address/position LUTs is the run-time
    // CN count of the group, and declaring a variably modified type whose bound
    // evaluates to 0 is undefined behaviour in C.
    // NOTE(review): a count of 0 would mean a CN group that is empty for the
    // configured LUT set - confirm against the LUT definitions.

    // =====================================================================
    // CN group with 3 BNs: only the first CN of every row is transferred
    {
        const uint16_t* shift  = (const uint16_t*) p_lut->circShift[0];
        const uint32_t* bnAddr = (const uint32_t*) p_lut->startAddrBnProcBuf[0];

        const uint32_t shiftCols = lut_numCnInCnGroups_BG1_R13[0]; // row width of circShift
        const uint32_t bnLutCols = numCn[0];                       // row width of bnAddr
        const uint32_t rowPitch  = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX;

        for (uint32_t row = 0; row < bnPerCn[0]; row++)
        {
            const int8_t* src = &cnRes[groupAddr[0] + row*rowPitch];

            nrLDPC_inv_circ_memcpy(&bnBuf[bnAddr[row*bnLutCols]], src, Z, shift[row*shiftCols]);
        }
    }

    // =====================================================================
    // CN groups with 4, 5, 6, 7, 8, 9, 10 and 19 BNs
    for (uint32_t g = 1; g < 9; g++)
    {
        const uint16_t* shift  = (const uint16_t*) p_lut->circShift[g];
        const uint32_t* bnAddr = (const uint32_t*) p_lut->startAddrBnProcBuf[g];
        const uint8_t*  bnPos  = (const uint8_t*)  p_lut->bnPosBnProcBuf[g];

        const uint32_t shiftCols = lut_numCnInCnGroups_BG1_R13[g]; // row width of circShift
        const uint32_t bnLutCols = numCn[g];                       // row width of bnAddr/bnPos
        // Distance between two consecutive BN rows of this group in the CN results buffer
        const uint32_t rowPitch  = lut_numCnInCnGroups_BG1_R13[g]*NR_LDPC_ZMAX;

        for (uint32_t row = 0; row < bnPerCn[g]; row++)
        {
            const int8_t* src = &cnRes[groupAddr[g] + row*rowPitch];

            // One Z-sized segment per CN of the group, stored back to back
            for (uint32_t cn = 0; cn < bnLutCols; cn++)
            {
                const uint32_t bnIdx = bnAddr[row*bnLutCols + cn] + bnPos[row*bnLutCols + cn]*Z;

                nrLDPC_inv_circ_memcpy(&bnBuf[bnIdx], src, Z, shift[row*shiftCols + cn]);
                src += Z;
            }
        }
    }
}
/**
\brief Copies the values in the BN processing results buffer to their corresponding place in the CN processing buffer for BG2.
\param p_lut Pointer to decoder LUTs
\param p_procBuf Pointer to the processing buffers
\param Z Lifting size
*/
static inline void nrLDPC_bn2cnProcBuf_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
const uint8_t* lut_numCnInCnGroups = p_lut->numCnInCnGroups;
const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;
const uint16_t (*lut_circShift_CNG3) [lut_numCnInCnGroups_BG2_R15[0]] = (uint16_t(*)[lut_numCnInCnGroups_BG2_R15[0]]) p_lut->circShift[0];
const uint16_t (*lut_circShift_CNG4) [lut_numCnInCnGroups_BG2_R15[1]] = (uint16_t(*)[lut_numCnInCnGroups_BG2_R15[1]]) p_lut->circShift[1];
const uint16_t (*lut_circShift_CNG5) [lut_numCnInCnGroups_BG2_R15[2]] = (uint16_t(*)[lut_numCnInCnGroups_BG2_R15[2]]) p_lut->circShift[2];
const uint16_t (*lut_circShift_CNG6) [lut_numCnInCnGroups_BG2_R15[3]] = (uint16_t(*)[lut_numCnInCnGroups_BG2_R15[3]]) p_lut->circShift[3];
const uint16_t (*lut_circShift_CNG8) [lut_numCnInCnGroups_BG2_R15[4]] = (uint16_t(*)[lut_numCnInCnGroups_BG2_R15[4]]) p_lut->circShift[4];
const uint16_t (*lut_circShift_CNG10) [lut_numCnInCnGroups_BG2_R15[5]] = (uint16_t(*)[lut_numCnInCnGroups_BG2_R15[5]]) p_lut->circShift[5];
const uint32_t (*lut_startAddrBnProcBuf_CNG3) [lut_numCnInCnGroups[0]] = (uint32_t(*)[lut_numCnInCnGroups[0]]) p_lut->startAddrBnProcBuf[0];
const uint32_t (*lut_startAddrBnProcBuf_CNG4) [lut_numCnInCnGroups[1]] = (uint32_t(*)[lut_numCnInCnGroups[1]]) p_lut->startAddrBnProcBuf[1];
const uint32_t (*lut_startAddrBnProcBuf_CNG5) [lut_numCnInCnGroups[2]] = (uint32_t(*)[lut_numCnInCnGroups[2]]) p_lut->startAddrBnProcBuf[2];
const uint32_t (*lut_startAddrBnProcBuf_CNG6) [lut_numCnInCnGroups[3]] = (uint32_t(*)[lut_numCnInCnGroups[3]]) p_lut->startAddrBnProcBuf[3];
const uint32_t (*lut_startAddrBnProcBuf_CNG8) [lut_numCnInCnGroups[4]] = (uint32_t(*)[lut_numCnInCnGroups[4]]) p_lut->startAddrBnProcBuf[4];
const uint32_t (*lut_startAddrBnProcBuf_CNG10) [lut_numCnInCnGroups[5]] = (uint32_t(*)[lut_numCnInCnGroups[5]]) p_lut->startAddrBnProcBuf[5];
const uint8_t (*lut_bnPosBnProcBuf_CNG3) [lut_numCnInCnGroups[0]] = (uint8_t(*)[lut_numCnInCnGroups[0]]) p_lut->bnPosBnProcBuf[0];
const uint8_t (*lut_bnPosBnProcBuf_CNG4) [lut_numCnInCnGroups[1]] = (uint8_t(*)[lut_numCnInCnGroups[1]]) p_lut->bnPosBnProcBuf[1];
const uint8_t (*lut_bnPosBnProcBuf_CNG5) [lut_numCnInCnGroups[2]] = (uint8_t(*)[lut_numCnInCnGroups[2]]) p_lut->bnPosBnProcBuf[2];
const uint8_t (*lut_bnPosBnProcBuf_CNG6) [lut_numCnInCnGroups[3]] = (uint8_t(*)[lut_numCnInCnGroups[3]]) p_lut->bnPosBnProcBuf[3];
const uint8_t (*lut_bnPosBnProcBuf_CNG8) [lut_numCnInCnGroups[4]] = (uint8_t(*)[lut_numCnInCnGroups[4]]) p_lut->bnPosBnProcBuf[4];
const uint8_t (*lut_bnPosBnProcBuf_CNG10) [lut_numCnInCnGroups[5]] = (uint8_t(*)[lut_numCnInCnGroups[5]]) p_lut->bnPosBnProcBuf[5];
int8_t* cnProcBuf = p_procBuf->cnProcBuf;
int8_t* bnProcBufRes = p_procBuf->bnProcBufRes;
int8_t* p_cnProcBuf;
uint32_t bitOffsetInGroup;
uint32_t i;
uint32_t j;
uint32_t idxBn = 0;
// For CN groups 3 to 6 no need to send the last BN back since it's single edge
// and BN processing does not change the value already in the CN proc buf
// =====================================================================
// CN group with 3 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG2_R15[0]*NR_LDPC_ZMAX;
for (j=0; j<2; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[0] + j*bitOffsetInGroup];
for (i=0; i<lut_numCnInCnGroups[0]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG3[j][i] + lut_bnPosBnProcBuf_CNG3[j][i]*Z;
nrLDPC_circ_memcpy(p_cnProcBuf, &bnProcBufRes[idxBn], Z, lut_circShift_CNG3[j][i]);
p_cnProcBuf += Z;
}
}
// =====================================================================
// CN group with 4 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG2_R15[1]*NR_LDPC_ZMAX;
for (j=0; j<3; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[1] + j*bitOffsetInGroup];
for (i=0; i<lut_numCnInCnGroups[1]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG4[j][i] + lut_bnPosBnProcBuf_CNG4[j][i]*Z;
nrLDPC_circ_memcpy(p_cnProcBuf, &bnProcBufRes[idxBn], Z, lut_circShift_CNG4[j][i]);
p_cnProcBuf += Z;
}
}
// =====================================================================
// CN group with 5 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG2_R15[2]*NR_LDPC_ZMAX;
for (j=0; j<4; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[2] + j*bitOffsetInGroup];
for (i=0; i<lut_numCnInCnGroups[2]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG5[j][i] + lut_bnPosBnProcBuf_CNG5[j][i]*Z;
nrLDPC_circ_memcpy(p_cnProcBuf, &bnProcBufRes[idxBn], Z, lut_circShift_CNG5[j][i]);
p_cnProcBuf += Z;
}
}
// =====================================================================
// CN group with 6 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG2_R15[3]*NR_LDPC_ZMAX;
for (j=0; j<5; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[3] + j*bitOffsetInGroup];
for (i=0; i<lut_numCnInCnGroups[3]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG6[j][i] + lut_bnPosBnProcBuf_CNG6[j][i]*Z;
nrLDPC_circ_memcpy(p_cnProcBuf, &bnProcBufRes[idxBn], Z, lut_circShift_CNG6[j][i]);
p_cnProcBuf += Z;
}
}
// =====================================================================
// CN group with 8 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG2_R15[4]*NR_LDPC_ZMAX;
for (j=0; j<8; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[4] + j*bitOffsetInGroup];
for (i=0; i<lut_numCnInCnGroups[4]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG8[j][i] + lut_bnPosBnProcBuf_CNG8[j][i]*Z;
nrLDPC_circ_memcpy(p_cnProcBuf, &bnProcBufRes[idxBn], Z, lut_circShift_CNG8[j][i]);
p_cnProcBuf += Z;
}
}
// =====================================================================
// CN group with 10 BNs
bitOffsetInGroup = lut_numCnInCnGroups_BG2_R15[5]*NR_LDPC_ZMAX;
for (j=0; j<10; j++)
{
p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[5] + j*bitOffsetInGroup];
for (i=0; i<lut_numCnInCnGroups[5]; i++)
{
idxBn = lut_startAddrBnProcBuf_CNG10[j][i] + lut_bnPosBnProcBuf_CNG10[j][i]*Z;
nrLDPC_circ_memcpy(p_cnProcBuf, &bnProcBufRes[idxBn], Z, lut_circShift_CNG10[j][i]);
p_cnProcBuf += Z;
}
}
}
/**
   \brief Copies one BG1 CN group from the BN processing results buffer to the CN processing buffer.
          For every copied BN row j and every CN i of the group, Z values are transferred with
          nrLDPC_circ_memcpy using the shift value stored in the circShift LUT for edge (j,i).
   \param p_lut Pointer to decoder LUTs
   \param p_procBuf Pointer to the processing buffers
   \param Z Lifting size
   \param idxCnGroup Index of the CN group inside the LUT arrays
   \param numBn Number of BN rows of this group that are copied
*/
static inline void nrLDPC_bn2cnProcBuf_BG1_cnGroup(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z, uint32_t idxCnGroup, uint32_t numBn)
{
    // Row length of the start address and BN position LUTs of this group
    const uint32_t numCn = p_lut->numCnInCnGroups[idxCnGroup];
    // Row length of the circular shift LUT; also fixes the spacing of BN rows in the CN buffer
    const uint32_t numCnR13 = lut_numCnInCnGroups_BG1_R13[idxCnGroup];
    const uint32_t bitOffsetInGroup = numCnR13*NR_LDPC_ZMAX;

    // The LUTs are 2D tables stored row by row. They are indexed as flat arrays here so that
    // no variable-length array type has to be formed from the run-time CN count
    // (NOTE(review): a group with zero CNs would make such a VLA bound 0 - confirm whether that can occur).
    const uint16_t* lut_circShift = (const uint16_t*) p_lut->circShift[idxCnGroup];
    const uint32_t* lut_startAddrBnProcBuf = (const uint32_t*) p_lut->startAddrBnProcBuf[idxCnGroup];
    const uint8_t* lut_bnPosBnProcBuf = (const uint8_t*) p_lut->bnPosBnProcBuf[idxCnGroup];

    const uint32_t startAddrCnGroup = p_lut->startAddrCnGroups[idxCnGroup];
    int8_t* cnProcBuf = p_procBuf->cnProcBuf;
    int8_t* bnProcBufRes = p_procBuf->bnProcBufRes;

    uint32_t i;
    uint32_t j;

    for (j=0; j<numBn; j++)
    {
        // Destination of BN row j; consecutive CNs of the row are Z values apart
        int8_t* p_cnProcBuf = &cnProcBuf[startAddrCnGroup + j*bitOffsetInGroup];

        // Deliberately no "#pragma omp simd" on this loop: the iterations are linked through
        // the advancing destination pointer and each one only performs a block copy.
        for (i=0; i<numCn; i++)
        {
            const uint32_t idxBn = lut_startAddrBnProcBuf[j*numCn + i] + lut_bnPosBnProcBuf[j*numCn + i]*Z;

            nrLDPC_circ_memcpy(p_cnProcBuf, &bnProcBufRes[idxBn], Z, lut_circShift[j*numCnR13 + i]);
            p_cnProcBuf += Z;
        }
    }
}

/**
   \brief Copies the values in the BN processing results buffer to their corresponding place in the CN processing buffer for BG1.
          The CN groups are handled one after the other (3, 4, 5, 6, 7, 8, 9, 10 and 19 BNs).
   \param p_lut Pointer to decoder LUTs
   \param p_procBuf Pointer to the processing buffers
   \param Z Lifting size
*/
static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
    const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;

    // LUTs of the CN group with 3 BNs, addressed as flat row-major tables
    const uint16_t* lut_circShift_CNG3 = (const uint16_t*) p_lut->circShift[0];
    const uint32_t* lut_startAddrBnProcBuf_CNG3 = (const uint32_t*) p_lut->startAddrBnProcBuf[0];
    const uint32_t numCnCNG3 = p_lut->numCnInCnGroups[0];
    const uint32_t numCnCNG3_R13 = lut_numCnInCnGroups_BG1_R13[0];
    const uint32_t bitOffsetInGroup = numCnCNG3_R13*NR_LDPC_ZMAX;

    int8_t* cnProcBuf = p_procBuf->cnProcBuf;
    int8_t* bnProcBufRes = p_procBuf->bnProcBufRes;

    uint32_t j;

    // For CN groups 3 to 19 no need to send the last BN back since it's single edge
    // and BN processing does not change the value already in the CN proc buf

    // =====================================================================
    // CN group with 3 BNs
    // Only the first CN of each BN row is copied and no BN position offset is added.
    for (j=0; j<2; j++)
    {
        nrLDPC_circ_memcpy(&cnProcBuf[lut_startAddrCnGroups[0] + j*bitOffsetInGroup],
                           &bnProcBufRes[lut_startAddrBnProcBuf_CNG3[j*numCnCNG3]],
                           Z,
                           lut_circShift_CNG3[j*numCnCNG3_R13]);
    }

    // =====================================================================
    // CN group with 4 BNs
    nrLDPC_bn2cnProcBuf_BG1_cnGroup(p_lut, p_procBuf, Z, 1, 3);

    // =====================================================================
    // CN group with 5 BNs
    nrLDPC_bn2cnProcBuf_BG1_cnGroup(p_lut, p_procBuf, Z, 2, 4);

    // =====================================================================
    // CN group with 6 BNs
    nrLDPC_bn2cnProcBuf_BG1_cnGroup(p_lut, p_procBuf, Z, 3, 5);

    // =====================================================================
    // CN group with 7 BNs
    nrLDPC_bn2cnProcBuf_BG1_cnGroup(p_lut, p_procBuf, Z, 4, 6);

    // =====================================================================
    // CN group with 8 BNs
    nrLDPC_bn2cnProcBuf_BG1_cnGroup(p_lut, p_procBuf, Z, 5, 7);

    // =====================================================================
    // CN group with 9 BNs
    nrLDPC_bn2cnProcBuf_BG1_cnGroup(p_lut, p_procBuf, Z, 6, 8);

    // =====================================================================
    // CN group with 10 BNs
    nrLDPC_bn2cnProcBuf_BG1_cnGroup(p_lut, p_procBuf, Z, 7, 9);

    // =====================================================================
    // CN group with 19 BNs
    // All 19 BN rows are copied for this group.
    nrLDPC_bn2cnProcBuf_BG1_cnGroup(p_lut, p_procBuf, Z, 8, 19);
}
/**
   \brief Moves the LLR results into the output LLR vector.
          The BNs connected to a single CN are copied as one contiguous block placed at the
          first parity column; every column before that is fetched individually from the
          LLR results buffer using the address and BN position LUTs.
   \param p_lut Pointer to decoder LUTs
   \param llrOut Pointer to output LLRs
   \param p_procBuf Pointer to the processing buffers
   \param Z Lifting size
   \param BG Base graph
*/
static inline void nrLDPC_llrRes2llrOut(t_nrLDPC_lut* p_lut, int8_t* llrOut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z, uint8_t BG)
{
    const uint16_t* addrLut = p_lut->llr2llrProcBufAddr;
    const uint8_t* bnPosLut = p_lut->llr2llrProcBufBnPos;
    const uint8_t numSingleCnBn = p_lut->numBnInBnGroups[0];
    int8_t* llrRes = p_procBuf->llrRes;

    uint32_t firstParityCol;
    uint32_t col;

    // Any base graph other than 1 is treated as BG2
    if (BG == 1)
    {
        firstParityCol = NR_LDPC_START_COL_PARITY_BG1;
    }
    else
    {
        firstParityCol = NR_LDPC_START_COL_PARITY_BG2;
    }

    // LLRs of BNs connected to 1 CN sit at the start of llrRes and go out in one piece
    if (numSingleCnBn > 0)
    {
        const uint32_t parityOffset = firstParityCol*Z;

        memcpy(&llrOut[parityOffset], llrRes, numSingleCnBn*Z);
    }

    // Remaining columns: one block of Z values per column, written back to back from llrOut[0]
    for (col=0; col<firstParityCol; col++)
    {
        const uint32_t srcIdx = addrLut[col] + bnPosLut[col]*Z;

        memcpy(&llrOut[col*Z], &llrRes[srcIdx], Z);
    }
}
#endif
......@@ -96,7 +96,7 @@ void nrLDPC_cnProc_BG1_generator_AVX2(const char* dir, int R)
// for (i=0; i<M; i++,iprime++)
// {
fprintf(fd," for (int i=0;i<M;i+=2) {\n");
fprintf(fd," for (int i=0;i<M;i++) {\n");
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment