Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
O
OpenXG-RAN
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Metrics
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
OpenXG
OpenXG-RAN
Commits
f6bb869c
Commit
f6bb869c
authored
Jun 12, 2022
by
Raymond Knopp
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fixed AVX2 issue for cnProc code generator. Some cleanup in formatting and deleting unused files.
parent
aabd9c6c
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
1085 additions
and
3148 deletions
+1085
-3148
openair1/PHY/CODING/nrLDPC_decoder.c
openair1/PHY/CODING/nrLDPC_decoder.c
+0
-505
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_cnProc.h
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_cnProc.h
+108
-939
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_cnProc_avx512.h
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_cnProc_avx512.h
+865
-0
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c
+111
-489
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_mPass.h
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_mPass.h
+0
-9
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_mPass.h_native_memcpy
...r1/PHY/CODING/nrLDPC_decoder/nrLDPC_mPass.h_native_memcpy
+0
-1205
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_cnProc/cnProc_gen_BG1_avx2.c
...coder/nrLDPC_tools/generator_cnProc/cnProc_gen_BG1_avx2.c
+1
-1
No files found.
openair1/PHY/CODING/nrLDPC_decoder.c
deleted
100644 → 0
View file @
aabd9c6c
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The OpenAirInterface Software Alliance licenses this file to You under
* the OAI Public License, Version 1.1 (the "License"); you may not use this file
* except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.openairinterface.org/?page_id=698
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*/
/*!\file nrLDPC_decoder.c
* \brief Defines the LDPC decoder
* \author Sebastian Wagner (TCL Communications) Email: <mailto:sebastian.wagner@tcl.com>
* \date 30-09-2019
* \version 2.0
* \note
* \warning
*/
#include <stdint.h>
#include <immintrin.h>
#include "nrLDPCdecoder_defs.h"
#include "nrLDPC_types.h"
#include "nrLDPC_init.h"
#include "nrLDPC_mPass.h"
#include "nrLDPC_cnProc.h"
#include "nrLDPC_bnProc.h"
#define NR_LDPC_ENABLE_PARITY_CHECK
#define NR_LDPC_PROFILER_DETAIL
#ifdef NR_LDPC_DEBUG_MODE
#include "nrLDPC_tools/nrLDPC_debug.h"
#endif
// Forward declaration of the decoder core: nrLDPC_decod() below calls it
// before its definition appears further down in this file.
static
inline
uint32_t
nrLDPC_decoder_core
(
int8_t
*
p_llr
,
int8_t
*
p_out
,
t_nrLDPC_procBuf
*
p_procBuf
,
uint32_t
numLLR
,
t_nrLDPC_lut
*
p_lut
,
t_nrLDPC_dec_params
*
p_decParams
,
t_nrLDPC_time_stats
*
p_profiler
);
/**
   \brief Decodes one LDPC segment: initializes the LUTs for the given parameters and runs the decoder core
   \param p_decParams LDPC decoder parameters
   \param p_llr Input LLRs
   \param p_out Output vector
   \param p_procBuf Processing buffers handed to the decoder core
   \param p_profiler LDPC profiler statistics
   \return Iteration counter reported by nrLDPC_decoder_core
*/
int32_t nrLDPC_decod(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, t_nrLDPC_procBuf* p_procBuf, t_nrLDPC_time_stats* p_profiler)
{
    // LUT storage lives on this stack frame for the duration of the call
    t_nrLDPC_lut decoderLut;

    // Initialize decoder core(s) with correct LUTs; the return value is the number of LLRs
    const uint32_t llrCount = nrLDPC_init(p_decParams, &decoderLut);

    // Launch LDPC decoder core for one segment and hand its iteration counter back
    return nrLDPC_decoder_core(p_llr, p_out, p_procBuf, llrCount, &decoderLut, p_decParams, p_profiler);
}
/**
   \brief Performs LDPC decoding of one code block
   \param p_llr Input LLRs
   \param p_out Output vector
   \param p_procBuf Processing buffers; llrProcBuf is used as temporary output unless outMode is nrLDPC_outMode_LLRINT8
   \param numLLR Number of LLRs
   \param p_lut Pointer to decoder LUTs
   \param p_decParams LDPC decoder parameters
   \param p_profiler LDPC profiler statistics
   \return Iteration counter. With NR_LDPC_ENABLE_PARITY_CHECK defined, a value greater than
           numMaxIter indicates that the parity check still failed.
*/
static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDPC_procBuf* p_procBuf, uint32_t numLLR, t_nrLDPC_lut* p_lut, t_nrLDPC_dec_params* p_decParams, t_nrLDPC_time_stats* p_profiler)
{
    uint16_t Z = p_decParams->Z;
    uint8_t BG = p_decParams->BG;
    uint8_t numMaxIter = p_decParams->numMaxIter;
    e_nrLDPC_outMode outMode = p_decParams->outMode;

    // Minimum number of iterations is 1: the first iteration below always runs
    uint32_t i = 1;
    // Initialize with parity check fail != 0
    int32_t pcRes = 1;
    int8_t* p_llrOut;

    if (outMode == nrLDPC_outMode_LLRINT8)
    {
        // LLR output requested: write results straight into the caller's buffer
        p_llrOut = p_out;
    }
    else
    {
        // Use LLR processing buffer as temporary output buffer
        p_llrOut = p_procBuf->llrProcBuf;
        // Clear llrProcBuf
        memset(p_llrOut, 0, NR_LDPC_MAX_NUM_LLR*sizeof(int8_t));
    }

    // Initialization
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->llr2llrProcBuf);
#endif
    nrLDPC_llr2llrProcBuf(p_lut, p_llr, p_procBuf, Z, BG);
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->llr2llrProcBuf);
#endif
#ifdef NR_LDPC_DEBUG_MODE
    nrLDPC_debug_initBuffer2File(nrLDPC_buffers_LLR_PROC);
    nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_LLR_PROC, p_procBuf);
#endif

#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->llr2CnProcBuf);
#endif
    if (BG == 1)
    {
        nrLDPC_llr2CnProcBuf_BG1(p_lut, p_llr, p_procBuf, Z);
    }
    else
    {
        nrLDPC_llr2CnProcBuf_BG2(p_lut, p_llr, p_procBuf, Z);
    }
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->llr2CnProcBuf);
#endif
#ifdef NR_LDPC_DEBUG_MODE
    nrLDPC_debug_initBuffer2File(nrLDPC_buffers_CN_PROC);
    nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_CN_PROC, p_procBuf);
#endif

    // First iteration

    // CN processing
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->cnProc);
#endif
    if (BG == 1)
    {
#ifdef __AVX512BW__
        nrLDPC_cnProc_BG1_AVX512(p_lut, p_procBuf, Z);
#else
        nrLDPC_cnProc_BG1(p_lut, p_procBuf, Z);
#endif
    }
    else
    {
        nrLDPC_cnProc_BG2(p_lut, p_procBuf, Z);
    }
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->cnProc);
#endif
#ifdef NR_LDPC_DEBUG_MODE
    nrLDPC_debug_initBuffer2File(nrLDPC_buffers_CN_PROC_RES);
    nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_CN_PROC_RES, p_procBuf);
#endif

    // Send CN results to the BN processing buffer
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->cn2bnProcBuf);
#endif
    if (BG == 1)
    {
        nrLDPC_cn2bnProcBuf_BG1(p_lut, p_procBuf, Z);
    }
    else
    {
        nrLDPC_cn2bnProcBuf_BG2(p_lut, p_procBuf, Z);
    }
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->cn2bnProcBuf);
#endif
#ifdef NR_LDPC_DEBUG_MODE
    nrLDPC_debug_initBuffer2File(nrLDPC_buffers_BN_PROC);
    nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_BN_PROC, p_procBuf);
#endif

    // BN processing
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->bnProcPc);
#endif
    nrLDPC_bnProcPc(p_lut, p_procBuf, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->bnProcPc);
#endif
#ifdef NR_LDPC_DEBUG_MODE
    nrLDPC_debug_initBuffer2File(nrLDPC_buffers_LLR_RES);
    nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_LLR_RES, p_procBuf);
#endif

#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->bnProc);
#endif
    nrLDPC_bnProc(p_lut, p_procBuf, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->bnProc);
#endif
#ifdef NR_LDPC_DEBUG_MODE
    nrLDPC_debug_initBuffer2File(nrLDPC_buffers_BN_PROC_RES);
    nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_BN_PROC_RES, p_procBuf);
#endif

    // BN results to CN processing buffer
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->bn2cnProcBuf);
#endif
    if (BG == 1)
    {
        nrLDPC_bn2cnProcBuf_BG1(p_lut, p_procBuf, Z);
    }
    else
    {
        nrLDPC_bn2cnProcBuf_BG2(p_lut, p_procBuf, Z);
    }
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->bn2cnProcBuf);
#endif
#ifdef NR_LDPC_DEBUG_MODE
    nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_CN_PROC, p_procBuf);
#endif

    // Parity Check not necessary here since it will fail
    // because first 2 cols/BNs in BG are punctured and cannot be
    // estimated after only one iteration

    // First iteration finished

    // Intermediate iterations.
    // The bound is written as (i + 1) < numMaxIter rather than i < (numMaxIter - 1):
    // with numMaxIter == 0 the latter evaluates to i < (unsigned)-1 after the usual
    // arithmetic conversions and would keep iterating until the parity check passes
    // or the counter nearly wraps. For numMaxIter >= 1 both forms are equivalent.
    while (((i + 1) < numMaxIter) && (pcRes != 0))
    {
        // Increase iteration counter
        i++;

        // CN processing
#ifdef NR_LDPC_PROFILER_DETAIL
        start_meas(&p_profiler->cnProc);
#endif
        if (BG == 1)
        {
#ifdef __AVX512BW__
            nrLDPC_cnProc_BG1_AVX512(p_lut, p_procBuf, Z);
#else
            nrLDPC_cnProc_BG1(p_lut, p_procBuf, Z);
#endif
        }
        else
        {
            nrLDPC_cnProc_BG2(p_lut, p_procBuf, Z);
        }
#ifdef NR_LDPC_PROFILER_DETAIL
        stop_meas(&p_profiler->cnProc);
#endif
#ifdef NR_LDPC_DEBUG_MODE
        nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_CN_PROC_RES, p_procBuf);
#endif

        // Send CN results back to BNs
#ifdef NR_LDPC_PROFILER_DETAIL
        start_meas(&p_profiler->cn2bnProcBuf);
#endif
        if (BG == 1)
        {
            nrLDPC_cn2bnProcBuf_BG1(p_lut, p_procBuf, Z);
        }
        else
        {
            nrLDPC_cn2bnProcBuf_BG2(p_lut, p_procBuf, Z);
        }
#ifdef NR_LDPC_PROFILER_DETAIL
        stop_meas(&p_profiler->cn2bnProcBuf);
#endif
#ifdef NR_LDPC_DEBUG_MODE
        nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_BN_PROC, p_procBuf);
#endif

        // BN Processing
#ifdef NR_LDPC_PROFILER_DETAIL
        start_meas(&p_profiler->bnProcPc);
#endif
        nrLDPC_bnProcPc(p_lut, p_procBuf, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
        stop_meas(&p_profiler->bnProcPc);
#endif
#ifdef NR_LDPC_DEBUG_MODE
        nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_LLR_RES, p_procBuf);
#endif

#ifdef NR_LDPC_PROFILER_DETAIL
        start_meas(&p_profiler->bnProc);
#endif
        nrLDPC_bnProc(p_lut, p_procBuf, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
        stop_meas(&p_profiler->bnProc);
#endif
#ifdef NR_LDPC_DEBUG_MODE
        nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_BN_PROC_RES, p_procBuf);
#endif

        // BN results to CN processing buffer
#ifdef NR_LDPC_PROFILER_DETAIL
        start_meas(&p_profiler->bn2cnProcBuf);
#endif
        if (BG == 1)
        {
            nrLDPC_bn2cnProcBuf_BG1(p_lut, p_procBuf, Z);
        }
        else
        {
            nrLDPC_bn2cnProcBuf_BG2(p_lut, p_procBuf, Z);
        }
#ifdef NR_LDPC_PROFILER_DETAIL
        stop_meas(&p_profiler->bn2cnProcBuf);
#endif
#ifdef NR_LDPC_DEBUG_MODE
        nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_CN_PROC, p_procBuf);
#endif

        // Parity Check
#ifdef NR_LDPC_ENABLE_PARITY_CHECK
#ifdef NR_LDPC_PROFILER_DETAIL
        start_meas(&p_profiler->cnProcPc);
#endif
        if (BG == 1)
        {
            pcRes = nrLDPC_cnProcPc_BG1(p_lut, p_procBuf, Z);
        }
        else
        {
            pcRes = nrLDPC_cnProcPc_BG2(p_lut, p_procBuf, Z);
        }
#ifdef NR_LDPC_PROFILER_DETAIL
        stop_meas(&p_profiler->cnProcPc);
#endif
#endif
    }

    // Last iteration
    if ((i < numMaxIter) && (pcRes != 0))
    {
        // Increase iteration counter
        i++;

        // CN processing
#ifdef NR_LDPC_PROFILER_DETAIL
        start_meas(&p_profiler->cnProc);
#endif
        if (BG == 1)
        {
#ifdef __AVX512BW__
            nrLDPC_cnProc_BG1_AVX512(p_lut, p_procBuf, Z);
#else
            nrLDPC_cnProc_BG1(p_lut, p_procBuf, Z);
#endif
        }
        else
        {
            nrLDPC_cnProc_BG2(p_lut, p_procBuf, Z);
        }
#ifdef NR_LDPC_PROFILER_DETAIL
        stop_meas(&p_profiler->cnProc);
#endif
#ifdef NR_LDPC_DEBUG_MODE
        nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_CN_PROC_RES, p_procBuf);
#endif

        // Send CN results back to BNs
#ifdef NR_LDPC_PROFILER_DETAIL
        start_meas(&p_profiler->cn2bnProcBuf);
#endif
        if (BG == 1)
        {
            nrLDPC_cn2bnProcBuf_BG1(p_lut, p_procBuf, Z);
        }
        else
        {
            nrLDPC_cn2bnProcBuf_BG2(p_lut, p_procBuf, Z);
        }
#ifdef NR_LDPC_PROFILER_DETAIL
        stop_meas(&p_profiler->cn2bnProcBuf);
#endif
#ifdef NR_LDPC_DEBUG_MODE
        nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_BN_PROC, p_procBuf);
#endif

        // BN Processing
#ifdef NR_LDPC_PROFILER_DETAIL
        start_meas(&p_profiler->bnProcPc);
#endif
        nrLDPC_bnProcPc(p_lut, p_procBuf, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
        stop_meas(&p_profiler->bnProcPc);
#endif
#ifdef NR_LDPC_DEBUG_MODE
        nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_LLR_RES, p_procBuf);
#endif

        // If parity check not enabled, no need to send the BN proc results
        // back to CNs
#ifdef NR_LDPC_ENABLE_PARITY_CHECK
#ifdef NR_LDPC_PROFILER_DETAIL
        start_meas(&p_profiler->bnProc);
#endif
        nrLDPC_bnProc(p_lut, p_procBuf, Z);
#ifdef NR_LDPC_PROFILER_DETAIL
        stop_meas(&p_profiler->bnProc);
#endif
#ifdef NR_LDPC_DEBUG_MODE
        nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_BN_PROC_RES, p_procBuf);
#endif

        // BN results to CN processing buffer
#ifdef NR_LDPC_PROFILER_DETAIL
        start_meas(&p_profiler->bn2cnProcBuf);
#endif
        if (BG == 1)
        {
            nrLDPC_bn2cnProcBuf_BG1(p_lut, p_procBuf, Z);
        }
        else
        {
            nrLDPC_bn2cnProcBuf_BG2(p_lut, p_procBuf, Z);
        }
#ifdef NR_LDPC_PROFILER_DETAIL
        stop_meas(&p_profiler->bn2cnProcBuf);
#endif
#ifdef NR_LDPC_DEBUG_MODE
        nrLDPC_debug_writeBuffer2File(nrLDPC_buffers_CN_PROC, p_procBuf);
#endif

        // Parity Check
#ifdef NR_LDPC_PROFILER_DETAIL
        start_meas(&p_profiler->cnProcPc);
#endif
        if (BG == 1)
        {
            pcRes = nrLDPC_cnProcPc_BG1(p_lut, p_procBuf, Z);
        }
        else
        {
            pcRes = nrLDPC_cnProcPc_BG2(p_lut, p_procBuf, Z);
        }
#ifdef NR_LDPC_PROFILER_DETAIL
        stop_meas(&p_profiler->cnProcPc);
#endif
#endif
    }

    // If the maximum number of iterations is reached and the PC still fails, increase the
    // iteration counter once more. Thus, i > numMaxIter indicates that PC has failed
#ifdef NR_LDPC_ENABLE_PARITY_CHECK
    if (pcRes != 0)
    {
        i++;
    }
#endif

    // Assign results from processing buffer to output
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->llrRes2llrOut);
#endif
    nrLDPC_llrRes2llrOut(p_lut, p_llrOut, p_procBuf, Z, BG);
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->llrRes2llrOut);
#endif

    // Hard-decision (skipped for LLR output, where p_llrOut already is p_out)
#ifdef NR_LDPC_PROFILER_DETAIL
    start_meas(&p_profiler->llr2bit);
#endif
    if (outMode == nrLDPC_outMode_BIT)
    {
        nrLDPC_llr2bitPacked(p_out, p_llrOut, numLLR);
    }
    else if (outMode == nrLDPC_outMode_BITINT8)
    {
        nrLDPC_llr2bit(p_out, p_llrOut, numLLR);
    }
#ifdef NR_LDPC_PROFILER_DETAIL
    stop_meas(&p_profiler->llr2bit);
#endif

    return i;
}
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_cnProc.h
View file @
f6bb869c
...
...
@@ -39,337 +39,9 @@
\param Z Lifting size
*/
// Per 8-bit lane: yields (z - a) where the most significant bit of b is set and a elsewhere,
// so with z = 0 it negates a wherever b is negative. Note that a is expanded twice.
#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a)
#ifdef __AVX512BW__
static
inline
void
nrLDPC_cnProc_BG2_AVX512
(
t_nrLDPC_lut
*
p_lut
,
int8_t
*
cnProcBuf
,
int8_t
*
cnProcBufRes
,
uint16_t
Z
)
{
const
uint8_t
*
lut_numCnInCnGroups
=
p_lut
->
numCnInCnGroups
;
const
uint32_t
*
lut_startAddrCnGroups
=
p_lut
->
startAddrCnGroups
;
__m512i
*
p_cnProcBuf
;
__m512i
*
p_cnProcBufRes
;
// Number of CNs in Groups
uint32_t
M
;
uint32_t
i
;
uint32_t
j
;
uint32_t
k
;
// Offset to each bit within a group in terms of 32 Byte
uint32_t
bitOffsetInGroup
;
__m512i
zmm0
,
min
,
sgn
,
zeros
;
zeros
=
_mm512_setzero_si512
();
// maxLLR = _mm512_set1_epi8((char)127);
__m512i
*
p_cnProcBufResBit
;
const
__m512i
*
p_ones
=
(
__m512i
*
)
ones512_epi8
;
const
__m512i
*
p_maxLLR
=
(
__m512i
*
)
maxLLR512_epi8
;
// LUT with offsets for bits that need to be processed
// 1. bit proc requires LLRs of 2. and 3. bit, 2.bits of 1. and 3. etc.
// Offsets are in units of bitOffsetInGroup
const
uint8_t
lut_idxCnProcG3
[
3
][
2
]
=
{{
72
,
144
},
{
0
,
144
},
{
0
,
72
}};
// =====================================================================
// Process group with 3 BNs
if
(
lut_numCnInCnGroups
[
0
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
0
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
0
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 3
p_cnProcBuf
=
(
__m512i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
0
]];
p_cnProcBufRes
=
(
__m512i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
0
]];
// Loop over every BN
for
(
j
=
0
;
j
<
3
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
__m512i
*
pj0
=
&
p_cnProcBuf
[(
lut_idxCnProcG3
[
j
][
0
]
/
2
)];
__m512i
*
pj1
=
&
p_cnProcBuf
[(
lut_idxCnProcG3
[
j
][
1
]
/
2
)];
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
zmm0
=
pj0
[
i
];
sgn
=
_mm512_xor_si512
(
*
p_ones
,
zmm0
);
min
=
_mm512_abs_epi8
(
zmm0
);
// 32 CNs of second BN
// zmm0 = p_cnProcBuf[(lut_idxCnProcG3[j][1]/2) + i];
zmm0
=
pj1
[
i
];
min
=
_mm512_min_epu8
(
min
,
_mm512_abs_epi8
(
zmm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
zmm0
);
// Store result
min
=
_mm512_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
//p_cnProcBufResBit[i]=_mm512_sign_epi8(min, sgn);
}
}
}
// =====================================================================
// Process group with 4 BNs
// Offset is 20*384/32 = 240
const
uint16_t
lut_idxCnProcG4
[
4
][
3
]
=
{{
240
,
480
,
720
},
{
0
,
480
,
720
},
{
0
,
240
,
720
},
{
0
,
240
,
480
}};
if
(
lut_numCnInCnGroups
[
1
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
1
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
1
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 4
p_cnProcBuf
=
(
__m512i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
1
]];
p_cnProcBufRes
=
(
__m512i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
1
]];
// Loop over every BN
for
(
j
=
0
;
j
<
4
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG4
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm512_xor_si512
(
*
p_ones
,
zmm0
);
min
=
_mm512_abs_epi8
(
zmm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
3
;
k
++
)
{
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG4
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm512_min_epu8
(
min
,
_mm512_abs_epi8
(
zmm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
zmm0
);
}
// Store result
min
=
_mm512_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 5 BNs
// Offset is 9*384/32 = 108
const
uint16_t
lut_idxCnProcG5
[
5
][
4
]
=
{{
108
,
216
,
324
,
432
},
{
0
,
216
,
324
,
432
},
{
0
,
108
,
324
,
432
},
{
0
,
108
,
216
,
432
},
{
0
,
108
,
216
,
324
}};
if
(
lut_numCnInCnGroups
[
2
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
2
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
2
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 5
p_cnProcBuf
=
(
__m512i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
2
]];
p_cnProcBufRes
=
(
__m512i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
2
]];
// Loop over every BN
for
(
j
=
0
;
j
<
5
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG5
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm512_xor_si512
(
*
p_ones
,
zmm0
);
min
=
_mm512_abs_epi8
(
zmm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
4
;
k
++
)
{
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG5
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm512_min_epu8
(
min
,
_mm512_abs_epi8
(
zmm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
zmm0
);
}
// Store result
min
=
_mm512_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 6 BNs
// Offset is 3*384/32 = 36
const
uint16_t
lut_idxCnProcG6
[
6
][
5
]
=
{{
36
,
72
,
108
,
144
,
180
},
{
0
,
72
,
108
,
144
,
180
},
{
0
,
36
,
108
,
144
,
180
},
{
0
,
36
,
72
,
144
,
180
},
{
0
,
36
,
72
,
108
,
180
},
{
0
,
36
,
72
,
108
,
144
}};
if
(
lut_numCnInCnGroups
[
3
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
3
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
3
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 6
p_cnProcBuf
=
(
__m512i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
3
]];
p_cnProcBufRes
=
(
__m512i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
3
]];
// Loop over every BN
for
(
j
=
0
;
j
<
6
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG6
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm512_xor_si512
(
*
p_ones
,
zmm0
);
min
=
_mm512_abs_epi8
(
zmm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
5
;
k
++
)
{
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG6
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm512_min_epu8
(
min
,
_mm512_abs_epi8
(
zmm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
zmm0
);
}
// Store result
min
=
_mm512_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 8 BNs
// Offset is 2*384/32 = 24
const
uint8_t
lut_idxCnProcG8
[
8
][
7
]
=
{{
24
,
48
,
72
,
96
,
120
,
144
,
168
},
{
0
,
48
,
72
,
96
,
120
,
144
,
168
},
{
0
,
24
,
72
,
96
,
120
,
144
,
168
},
{
0
,
24
,
48
,
96
,
120
,
144
,
168
},
{
0
,
24
,
48
,
72
,
120
,
144
,
168
},
{
0
,
24
,
48
,
72
,
96
,
144
,
168
},
{
0
,
24
,
48
,
72
,
96
,
120
,
168
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
}};
if
(
lut_numCnInCnGroups
[
4
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
4
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
4
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 8
p_cnProcBuf
=
(
__m512i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
4
]];
p_cnProcBufRes
=
(
__m512i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
4
]];
// Loop over every BN
for
(
j
=
0
;
j
<
8
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG8
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm512_xor_si512
(
*
p_ones
,
zmm0
);
min
=
_mm512_abs_epi8
(
zmm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
7
;
k
++
)
{
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG8
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm512_min_epu8
(
min
,
_mm512_abs_epi8
(
zmm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
zmm0
);
}
// Store result
min
=
_mm512_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 10 BNs
// Offset is 2*384/32 = 24
const
uint8_t
lut_idxCnProcG10
[
10
][
9
]
=
{{
24
,
48
,
72
,
96
,
120
,
144
,
168
,
192
,
216
},
{
0
,
48
,
72
,
96
,
120
,
144
,
168
,
192
,
216
},
{
0
,
24
,
72
,
96
,
120
,
144
,
168
,
192
,
216
},
{
0
,
24
,
48
,
96
,
120
,
144
,
168
,
192
,
216
},
{
0
,
24
,
48
,
72
,
120
,
144
,
168
,
192
,
216
},
{
0
,
24
,
48
,
72
,
96
,
144
,
168
,
192
,
216
},
{
0
,
24
,
48
,
72
,
96
,
120
,
168
,
192
,
216
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
,
192
,
216
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
,
168
,
216
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
,
168
,
192
}};
if
(
lut_numCnInCnGroups
[
5
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
5
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
5
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 10
p_cnProcBuf
=
(
__m512i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
5
]];
p_cnProcBufRes
=
(
__m512i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
5
]];
// Loop over every BN
for
(
j
=
0
;
j
<
10
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG10
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm512_xor_si512
(
*
p_ones
,
zmm0
);
min
=
_mm512_abs_epi8
(
zmm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
9
;
k
++
)
{
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG10
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm512_min_epu8
(
min
,
_mm512_abs_epi8
(
zmm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
zmm0
);
}
// Store result
min
=
_mm512_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
}
}
}
}
#include "nrLDPC_cnProc_avx512.h"
#else
...
...
@@ -392,361 +64,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, int8_t* cnProcBuf, int
__m256i
ymm0
,
min
,
sgn
;
__m256i
*
p_cnProcBufResBit
;
const
__m256i
*
p_ones
=
(
__m256i
*
)
ones256_epi8
;
const
__m256i
*
p_maxLLR
=
(
__m256i
*
)
maxLLR256_epi8
;
// LUT with offsets for bits that need to be processed
// 1. bit proc requires LLRs of 2. and 3. bit, 2.bits of 1. and 3. etc.
// Offsets are in units of bitOffsetInGroup
const
uint8_t
lut_idxCnProcG3
[
3
][
2
]
=
{{
72
,
144
},
{
0
,
144
},
{
0
,
72
}};
// =====================================================================
// Process group with 3 BNs
if
(
lut_numCnInCnGroups
[
0
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
0
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
0
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 3
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
0
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
0
]];
// Loop over every BN
for
(
j
=
0
;
j
<
3
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
__m256i
*
pj0
=
&
p_cnProcBuf
[
lut_idxCnProcG3
[
j
][
0
]];
__m256i
*
pj1
=
&
p_cnProcBuf
[
lut_idxCnProcG3
[
j
][
1
]];
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
ymm0
=
pj0
[
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// 32 CNs of second BN
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
ymm0
=
pj1
[
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
//p_cnProcBufResBit[i]=_mm256_sign_epi8(min, sgn);
}
}
}
// =====================================================================
// Process group with 4 BNs
// Offset is 20*384/32 = 240
const
uint16_t
lut_idxCnProcG4
[
4
][
3
]
=
{{
240
,
480
,
720
},
{
0
,
480
,
720
},
{
0
,
240
,
720
},
{
0
,
240
,
480
}};
if
(
lut_numCnInCnGroups
[
1
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
1
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
1
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 4
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
1
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
1
]];
// Loop over every BN
for
(
j
=
0
;
j
<
4
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG4
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
3
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG4
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 5 BNs
// Offset is 9*384/32 = 108
const
uint16_t
lut_idxCnProcG5
[
5
][
4
]
=
{{
108
,
216
,
324
,
432
},
{
0
,
216
,
324
,
432
},
{
0
,
108
,
324
,
432
},
{
0
,
108
,
216
,
432
},
{
0
,
108
,
216
,
324
}};
if
(
lut_numCnInCnGroups
[
2
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
2
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
2
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 5
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
2
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
2
]];
// Loop over every BN
for
(
j
=
0
;
j
<
5
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG5
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
4
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG5
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 6 BNs
// Offset is 3*384/32 = 36
const
uint16_t
lut_idxCnProcG6
[
6
][
5
]
=
{{
36
,
72
,
108
,
144
,
180
},
{
0
,
72
,
108
,
144
,
180
},
{
0
,
36
,
108
,
144
,
180
},
{
0
,
36
,
72
,
144
,
180
},
{
0
,
36
,
72
,
108
,
180
},
{
0
,
36
,
72
,
108
,
144
}};
if
(
lut_numCnInCnGroups
[
3
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
3
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
3
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 6
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
3
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
3
]];
// Loop over every BN
for
(
j
=
0
;
j
<
6
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG6
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
5
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG6
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 8 BNs
// Offset is 2*384/32 = 24
const
uint8_t
lut_idxCnProcG8
[
8
][
7
]
=
{{
24
,
48
,
72
,
96
,
120
,
144
,
168
},
{
0
,
48
,
72
,
96
,
120
,
144
,
168
},
{
0
,
24
,
72
,
96
,
120
,
144
,
168
},
{
0
,
24
,
48
,
96
,
120
,
144
,
168
},
{
0
,
24
,
48
,
72
,
120
,
144
,
168
},
{
0
,
24
,
48
,
72
,
96
,
144
,
168
},
{
0
,
24
,
48
,
72
,
96
,
120
,
168
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
}};
if
(
lut_numCnInCnGroups
[
4
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
4
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
4
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 8
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
4
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
4
]];
// Loop over every BN
for
(
j
=
0
;
j
<
8
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG8
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
7
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG8
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 10 BNs
// Offset is 2*384/32 = 24
const
uint8_t
lut_idxCnProcG10
[
10
][
9
]
=
{{
24
,
48
,
72
,
96
,
120
,
144
,
168
,
192
,
216
},
{
0
,
48
,
72
,
96
,
120
,
144
,
168
,
192
,
216
},
{
0
,
24
,
72
,
96
,
120
,
144
,
168
,
192
,
216
},
{
0
,
24
,
48
,
96
,
120
,
144
,
168
,
192
,
216
},
{
0
,
24
,
48
,
72
,
120
,
144
,
168
,
192
,
216
},
{
0
,
24
,
48
,
72
,
96
,
144
,
168
,
192
,
216
},
{
0
,
24
,
48
,
72
,
96
,
120
,
168
,
192
,
216
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
,
192
,
216
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
,
168
,
216
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
,
168
,
192
}};
if
(
lut_numCnInCnGroups
[
5
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
5
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
5
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 10
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
5
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
5
]];
// Loop over every BN
for
(
j
=
0
;
j
<
10
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG10
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
9
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG10
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
}
#endif
/**
\brief Performs CN processing for BG1 on the CN processing buffer and stores the results in the CN processing results buffer.
\param p_lut Pointer to decoder LUTs
\param Z Lifting size
*/
#ifdef __AVX512BW__
static
inline
void
nrLDPC_cnProc_BG1_AVX512
(
t_nrLDPC_lut
*
p_lut
,
t_nrLDPC_procBuf
*
p_procBuf
,
uint16_t
Z
)
{
const
uint8_t
*
lut_numCnInCnGroups
=
p_lut
->
numCnInCnGroups
;
const
uint32_t
*
lut_startAddrCnGroups
=
p_lut
->
startAddrCnGroups
;
int8_t
*
cnProcBuf
=
p_procBuf
->
cnProcBuf
;
int8_t
*
cnProcBufRes
=
p_procBuf
->
cnProcBufRes
;
__m512i
*
p_cnProcBuf
;
__m512i
*
p_cnProcBufRes
;
// Number of CNs in Groups
uint32_t
M
;
uint32_t
i
;
uint32_t
j
;
uint32_t
k
;
// Offset to each bit within a group in terms of 32 Byte
uint32_t
bitOffsetInGroup
;
__m512i
zmm0
,
min
,
sgn
,
zeros
;
zeros
=
_mm512_setzero_si512
();
// maxLLR = _mm512_set1_epi8((char)127);
__m512i
*
p_cnProcBufResBit
;
const
__m512i
*
p_ones
=
(
__m512i
*
)
ones512_epi8
;
const
__m512i
*
p_maxLLR
=
(
__m512i
*
)
maxLLR512_epi8
;
const
__m256i
*
p_ones
=
(
__m256i
*
)
ones256_epi8
;
const
__m256i
*
p_maxLLR
=
(
__m256i
*
)
maxLLR256_epi8
;
// LUT with offsets for bits that need to be processed
// 1. bit proc requires LLRs of 2. and 3. bit, 2.bits of 1. and 3. etc.
// Offsets are in units of bitOffsetInGroup
(1*384/32)
const
uint8_t
lut_idxCnProcG3
[
3
][
2
]
=
{{
12
,
24
},
{
0
,
24
},
{
0
,
1
2
}};
// Offsets are in units of bitOffsetInGroup
const
uint8_t
lut_idxCnProcG3
[
3
][
2
]
=
{{
72
,
144
},
{
0
,
144
},
{
0
,
7
2
}};
// =====================================================================
// Process group with 3 BNs
...
...
@@ -755,14 +79,13 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
0
]
*
Z
+
63
)
>>
6
;
M
=
(
lut_numCnInCnGroups
[
0
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG
1_R13
[
0
]
*
NR_LDPC_ZMAX
)
>>
6
;
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG
2_R15
[
0
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 3
p_cnProcBuf
=
(
__m
512
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
0
]];
p_cnProcBufRes
=
(
__m
512
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
0
]];
p_cnProcBuf
=
(
__m
256
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
0
]];
p_cnProcBufRes
=
(
__m
256
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
0
]];
// Loop over every BN
for
(
j
=
0
;
j
<
3
;
j
++
)
...
...
@@ -770,24 +93,29 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
__m256i
*
pj0
=
&
p_cnProcBuf
[
lut_idxCnProcG3
[
j
][
0
]];
__m256i
*
pj1
=
&
p_cnProcBuf
[
lut_idxCnProcG3
[
j
][
1
]];
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG3
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm512_xor_si512
(
*
p_ones
,
zmm0
);
min
=
_mm512_abs_epi8
(
zmm0
);
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
ymm0
=
pj0
[
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// 32 CNs of second BN
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG3
[
j
][
1
]
/
2
)
+
i
];
min
=
_mm512_min_epu8
(
min
,
_mm512_abs_epi8
(
zmm0
))
;
sgn
=
_mm512_xor_si512
(
sgn
,
zmm0
);
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1]
+ i];
ymm0
=
pj1
[
i
]
;
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
)
);
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
// Store result
min
=
_mm512_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
//*p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
//p_cnProcBufResBit++;
p_cnProcBufResBit
[
i
]
=
_mm256_sign_epi8
(
min
,
sgn
);
}
}
}
...
...
@@ -795,21 +123,20 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// =====================================================================
// Process group with 4 BNs
// Offset is
5*384/32 = 6
0
const
uint
8_t
lut_idxCnProcG4
[
4
][
3
]
=
{{
60
,
120
,
180
},
{
0
,
120
,
180
},
{
0
,
60
,
180
},
{
0
,
60
,
12
0
}};
// Offset is
20*384/32 = 24
0
const
uint
16_t
lut_idxCnProcG4
[
4
][
3
]
=
{{
240
,
480
,
720
},
{
0
,
480
,
720
},
{
0
,
240
,
720
},
{
0
,
240
,
48
0
}};
if
(
lut_numCnInCnGroups
[
1
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
1
]
*
Z
+
63
)
>>
6
;
M
=
(
lut_numCnInCnGroups
[
1
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG
1_R13
[
1
]
*
NR_LDPC_ZMAX
)
>>
6
;
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG
2_R15
[
1
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 4
p_cnProcBuf
=
(
__m
512
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
1
]];
p_cnProcBufRes
=
(
__m
512
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
1
]];
p_cnProcBuf
=
(
__m
256
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
1
]];
p_cnProcBufRes
=
(
__m
256
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
1
]];
// Loop over every BN
for
(
j
=
0
;
j
<
4
;
j
++
)
...
...
@@ -821,21 +148,21 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG4
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm
512_xor_si512
(
*
p_ones
,
z
mm0
);
min
=
_mm
512_abs_epi8
(
z
mm0
);
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG4
[
j
][
0
]
+
i
];
sgn
=
_mm
256_sign_epi8
(
*
p_ones
,
y
mm0
);
min
=
_mm
256_abs_epi8
(
y
mm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
3
;
k
++
)
{
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG4
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm
512_min_epu8
(
min
,
_mm512_abs_epi8
(
z
mm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
z
mm0
);
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG4
[
j
][
k
]
+
i
];
min
=
_mm
256_min_epu8
(
min
,
_mm256_abs_epi8
(
y
mm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
y
mm0
);
}
// Store result
min
=
_mm
512
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
min
=
_mm
256
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
...
...
@@ -844,22 +171,21 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// =====================================================================
// Process group with 5 BNs
// Offset is
18*384/32 = 216
const
uint16_t
lut_idxCnProcG5
[
5
][
4
]
=
{{
216
,
432
,
648
,
864
},
{
0
,
432
,
648
,
864
},
{
0
,
216
,
648
,
864
},
{
0
,
216
,
432
,
864
},
{
0
,
216
,
432
,
648
}};
// Offset is
9*384/32 = 108
const
uint16_t
lut_idxCnProcG5
[
5
][
4
]
=
{{
108
,
216
,
324
,
432
},
{
0
,
216
,
324
,
432
},
{
0
,
108
,
324
,
432
},
{
0
,
108
,
216
,
432
},
{
0
,
108
,
216
,
324
}};
if
(
lut_numCnInCnGroups
[
2
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
2
]
*
Z
+
63
)
>>
6
;
M
=
(
lut_numCnInCnGroups
[
2
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG
1_R13
[
2
]
*
NR_LDPC_ZMAX
)
>>
6
;
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG
2_R15
[
2
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 5
p_cnProcBuf
=
(
__m
512
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
2
]];
p_cnProcBufRes
=
(
__m
512
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
2
]];
p_cnProcBuf
=
(
__m
256
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
2
]];
p_cnProcBufRes
=
(
__m
256
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
2
]];
// Loop over every BN
for
(
j
=
0
;
j
<
5
;
j
++
)
...
...
@@ -871,21 +197,21 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG5
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm
512_xor_si512
(
*
p_ones
,
z
mm0
);
min
=
_mm
512_abs_epi8
(
z
mm0
);
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG5
[
j
][
0
]
+
i
];
sgn
=
_mm
256_sign_epi8
(
*
p_ones
,
y
mm0
);
min
=
_mm
256_abs_epi8
(
y
mm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
4
;
k
++
)
{
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG5
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm
512_min_epu8
(
min
,
_mm512_abs_epi8
(
z
mm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
z
mm0
);
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG5
[
j
][
k
]
+
i
];
min
=
_mm
256_min_epu8
(
min
,
_mm256_abs_epi8
(
y
mm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
y
mm0
);
}
// Store result
min
=
_mm
512
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
min
=
_mm
256
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
...
...
@@ -894,23 +220,22 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// =====================================================================
// Process group with 6 BNs
// Offset is
8*384/32 = 9
6
const
uint16_t
lut_idxCnProcG6
[
6
][
5
]
=
{{
96
,
192
,
288
,
384
,
480
},
{
0
,
192
,
288
,
384
,
4
80
},
{
0
,
96
,
288
,
384
,
480
},
{
0
,
96
,
192
,
384
,
4
80
},
{
0
,
96
,
192
,
288
,
480
},
{
0
,
96
,
192
,
288
,
38
4
}};
// Offset is
3*384/32 = 3
6
const
uint16_t
lut_idxCnProcG6
[
6
][
5
]
=
{{
36
,
72
,
108
,
144
,
180
},
{
0
,
72
,
108
,
144
,
1
80
},
{
0
,
36
,
108
,
144
,
180
},
{
0
,
36
,
72
,
144
,
1
80
},
{
0
,
36
,
72
,
108
,
180
},
{
0
,
36
,
72
,
108
,
14
4
}};
if
(
lut_numCnInCnGroups
[
3
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
3
]
*
Z
+
63
)
>>
6
;
M
=
(
lut_numCnInCnGroups
[
3
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG
1_R13
[
3
]
*
NR_LDPC_ZMAX
)
>>
6
;
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG
2_R15
[
3
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 6
p_cnProcBuf
=
(
__m
512
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
3
]];
p_cnProcBufRes
=
(
__m
512
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
3
]];
p_cnProcBuf
=
(
__m
256
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
3
]];
p_cnProcBufRes
=
(
__m
256
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
3
]];
// Loop over every BN
for
(
j
=
0
;
j
<
6
;
j
++
)
...
...
@@ -922,73 +247,21 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG6
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm
512_xor_si512
(
*
p_ones
,
z
mm0
);
min
=
_mm
512_abs_epi8
(
z
mm0
);
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG6
[
j
][
0
]
+
i
];
sgn
=
_mm
256_sign_epi8
(
*
p_ones
,
y
mm0
);
min
=
_mm
256_abs_epi8
(
y
mm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
5
;
k
++
)
{
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG6
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm512_min_epu8
(
min
,
_mm512_abs_epi8
(
zmm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
zmm0
);
}
// Store result
min
=
_mm512_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 7 BNs
// Offset is 5*384/32 = 60
const
uint16_t
lut_idxCnProcG7
[
7
][
6
]
=
{{
60
,
120
,
180
,
240
,
300
,
360
},
{
0
,
120
,
180
,
240
,
300
,
360
},
{
0
,
60
,
180
,
240
,
300
,
360
},
{
0
,
60
,
120
,
240
,
300
,
360
},
{
0
,
60
,
120
,
180
,
300
,
360
},
{
0
,
60
,
120
,
180
,
240
,
360
},
{
0
,
60
,
120
,
180
,
240
,
300
}};
if
(
lut_numCnInCnGroups
[
4
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
4
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG1_R13
[
4
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 7
p_cnProcBuf
=
(
__m512i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
4
]];
p_cnProcBufRes
=
(
__m512i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
4
]];
// Loop over every BN
for
(
j
=
0
;
j
<
7
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG7
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm512_xor_si512
(
*
p_ones
,
zmm0
);
min
=
_mm512_abs_epi8
(
zmm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
6
;
k
++
)
{
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG7
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm512_min_epu8
(
min
,
_mm512_abs_epi8
(
zmm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
zmm0
);
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG6
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm
512
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
min
=
_mm
256
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
...
...
@@ -1003,18 +276,17 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
{
0
,
24
,
48
,
72
,
120
,
144
,
168
},
{
0
,
24
,
48
,
72
,
96
,
144
,
168
},
{
0
,
24
,
48
,
72
,
96
,
120
,
168
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
}};
if
(
lut_numCnInCnGroups
[
5
]
>
0
)
if
(
lut_numCnInCnGroups
[
4
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
5
]
*
Z
+
63
)
>>
6
;
M
=
(
lut_numCnInCnGroups
[
4
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG
1_R13
[
5
]
*
NR_LDPC_ZMAX
)
>>
6
;
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG
2_R15
[
4
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 8
p_cnProcBuf
=
(
__m
512i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
5
]];
p_cnProcBufRes
=
(
__m
512i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
5
]];
p_cnProcBuf
=
(
__m
256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
4
]];
p_cnProcBufRes
=
(
__m
256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
4
]];
// Loop over every BN
for
(
j
=
0
;
j
<
8
;
j
++
)
...
...
@@ -1026,74 +298,21 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG8
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm
512_xor_si512
(
*
p_ones
,
z
mm0
);
min
=
_mm
512_abs_epi8
(
z
mm0
);
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG8
[
j
][
0
]
+
i
];
sgn
=
_mm
256_sign_epi8
(
*
p_ones
,
y
mm0
);
min
=
_mm
256_abs_epi8
(
y
mm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
7
;
k
++
)
{
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG8
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm512_min_epu8
(
min
,
_mm512_abs_epi8
(
zmm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
zmm0
);
}
// Store result
min
=
_mm512_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 9 BNs
// Offset is 2*384/32 = 24
const
uint8_t
lut_idxCnProcG9
[
9
][
8
]
=
{{
24
,
48
,
72
,
96
,
120
,
144
,
168
,
192
},
{
0
,
48
,
72
,
96
,
120
,
144
,
168
,
192
},
{
0
,
24
,
72
,
96
,
120
,
144
,
168
,
192
},
{
0
,
24
,
48
,
96
,
120
,
144
,
168
,
192
},
{
0
,
24
,
48
,
72
,
120
,
144
,
168
,
192
},
{
0
,
24
,
48
,
72
,
96
,
144
,
168
,
192
},
{
0
,
24
,
48
,
72
,
96
,
120
,
168
,
192
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
,
192
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
,
168
}};
if
(
lut_numCnInCnGroups
[
6
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
6
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG1_R13
[
6
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 9
p_cnProcBuf
=
(
__m512i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
6
]];
p_cnProcBufRes
=
(
__m512i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
6
]];
// Loop over every BN
for
(
j
=
0
;
j
<
9
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG9
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm512_xor_si512
(
*
p_ones
,
zmm0
);
min
=
_mm512_abs_epi8
(
zmm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
8
;
k
++
)
{
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG9
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm512_min_epu8
(
min
,
_mm512_abs_epi8
(
zmm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
zmm0
);
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG8
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm
512
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
min
=
_mm
256
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
...
...
@@ -1102,25 +321,24 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// =====================================================================
// Process group with 10 BNs
// Offset is
1*384/32 = 12
const
uint8_t
lut_idxCnProcG10
[
10
][
9
]
=
{{
12
,
24
,
36
,
48
,
60
,
72
,
84
,
96
,
108
},
{
0
,
24
,
36
,
48
,
60
,
72
,
84
,
96
,
108
},
{
0
,
12
,
36
,
48
,
60
,
72
,
84
,
96
,
108
},
{
0
,
12
,
24
,
48
,
60
,
72
,
84
,
96
,
108
},
{
0
,
12
,
24
,
36
,
60
,
72
,
84
,
96
,
108
},
{
0
,
12
,
24
,
36
,
48
,
72
,
84
,
96
,
108
},
{
0
,
12
,
24
,
36
,
48
,
60
,
84
,
96
,
108
},
{
0
,
12
,
24
,
36
,
48
,
60
,
72
,
96
,
108
},
{
0
,
12
,
24
,
36
,
48
,
60
,
72
,
84
,
108
},
{
0
,
12
,
24
,
36
,
48
,
60
,
72
,
84
,
96
}};
// Offset is
2*384/32 = 24
const
uint8_t
lut_idxCnProcG10
[
10
][
9
]
=
{{
24
,
48
,
72
,
96
,
120
,
144
,
168
,
192
,
216
},
{
0
,
48
,
72
,
96
,
120
,
144
,
168
,
192
,
216
},
{
0
,
24
,
72
,
96
,
120
,
144
,
168
,
192
,
216
},
{
0
,
24
,
48
,
96
,
120
,
144
,
168
,
192
,
216
},
{
0
,
24
,
48
,
72
,
120
,
144
,
168
,
192
,
216
},
{
0
,
24
,
48
,
72
,
96
,
144
,
168
,
192
,
216
},
{
0
,
24
,
48
,
72
,
96
,
120
,
168
,
192
,
216
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
,
192
,
216
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
,
168
,
216
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
,
168
,
192
}};
if
(
lut_numCnInCnGroups
[
7
]
>
0
)
if
(
lut_numCnInCnGroups
[
5
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
7
]
*
Z
+
63
)
>>
6
;
M
=
(
lut_numCnInCnGroups
[
5
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG
1_R13
[
7
]
*
NR_LDPC_ZMAX
)
>>
6
;
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG
2_R15
[
5
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 10
p_cnProcBuf
=
(
__m
512i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
7
]];
p_cnProcBufRes
=
(
__m
512i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
7
]];
p_cnProcBuf
=
(
__m
256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
5
]];
p_cnProcBufRes
=
(
__m
256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
5
]];
// Loop over every BN
for
(
j
=
0
;
j
<
10
;
j
++
)
...
...
@@ -1132,87 +350,37 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG10
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm
512_xor_si512
(
*
p_ones
,
z
mm0
);
min
=
_mm
512_abs_epi8
(
z
mm0
);
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG10
[
j
][
0
]
+
i
];
sgn
=
_mm
256_sign_epi8
(
*
p_ones
,
y
mm0
);
min
=
_mm
256_abs_epi8
(
y
mm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
9
;
k
++
)
{
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG10
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm
512_min_epu8
(
min
,
_mm512_abs_epi8
(
z
mm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
z
mm0
);
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG10
[
j
][
k
]
+
i
];
min
=
_mm
256_min_epu8
(
min
,
_mm256_abs_epi8
(
y
mm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
y
mm0
);
}
// Store result
min
=
_mm
512
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
min
=
_mm
256
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 19 BNs
// Offset is 4*384/32 = 48
const
uint16_t
lut_idxCnProcG19
[
19
][
18
]
=
{{
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
}};
if
(
lut_numCnInCnGroups
[
8
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
8
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG1_R13
[
8
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 19
p_cnProcBuf
=
(
__m512i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
8
]];
p_cnProcBufRes
=
(
__m512i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
8
]];
// Loop over every BN
for
(
j
=
0
;
j
<
19
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
}
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG19
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm512_xor_si512
(
*
p_ones
,
zmm0
);
min
=
_mm512_abs_epi8
(
zmm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
18
;
k
++
)
{
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG19
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm512_min_epu8
(
min
,
_mm512_abs_epi8
(
zmm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
zmm0
);
}
/**
\brief Performs CN processing for BG1 on the CN processing buffer and stores the results in the CN processing results buffer.
\param p_lut Pointer to decoder LUTs
\param Z Lifting size
*/
// Store result
min
=
_mm512_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -128
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
}
}
}
}
#else
/**
\brief Performs CN processing for BG1 on the CN processing buffer and stores the results in the CN processing results buffer.
\param p_lut Pointer to decoder LUTs
...
...
@@ -1708,6 +876,7 @@ static inline void nrLDPC_cnProc_BG1(t_nrLDPC_lut* p_lut, int8_t* cnProcBuf, int
}
}
#endif
/**
\brief Performs parity check for BG1 on the CN processing buffer. Stops as soon as error is detected.
...
...
openair1/PHY/CODING/nrLDPC_
cnProc
.h
→
openair1/PHY/CODING/nrLDPC_
decoder/nrLDPC_cnProc_avx512
.h
View file @
f6bb869c
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
...
...
@@ -19,10 +20,10 @@
* contact@openairinterface.org
*/
/*!\file nrLDPC_cnProc.h
/*!\file nrLDPC_cnProc
_avx512
.h
* \brief Defines the functions for check node processing
* \author Sebastian Wagner (TCL Communications) Email: <mailto:sebastian.wagner@tcl.com>
* \date 30-09-20
19
* \date 30-09-20
21
* \version 1.0
* \note
* \warning
...
...
@@ -31,22 +32,14 @@
#ifndef __NR_LDPC_CNPROC__H__
#define __NR_LDPC_CNPROC__H__
/**
\brief Performs CN processing for BG2 on the CN processing buffer and stores the results in the CN processing results buffer.
\param p_lut Pointer to decoder LUTs
\param p_procBuf Pointer to processing buffers
\param Z Lifting size
*/
static
inline
void
nrLDPC_cnProc_BG2
(
t_nrLDPC_lut
*
p_lut
,
t_nrLDPC_procBuf
*
p_procBuf
,
uint16_t
Z
)
#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a)
static
inline
void
nrLDPC_cnProc_BG2_AVX512
(
t_nrLDPC_lut
*
p_lut
,
int8_t
*
cnProcBuf
,
int8_t
*
cnProcBufRes
,
uint16_t
Z
)
{
const
uint8_t
*
lut_numCnInCnGroups
=
p_lut
->
numCnInCnGroups
;
const
uint32_t
*
lut_startAddrCnGroups
=
p_lut
->
startAddrCnGroups
;
int8_t
*
cnProcBuf
=
p_procBuf
->
cnProcBuf
;
int8_t
*
cnProcBufRes
=
p_procBuf
->
cnProcBufRes
;
__m256i
*
p_cnProcBuf
;
__m256i
*
p_cnProcBufRes
;
__m512i
*
p_cnProcBuf
;
__m512i
*
p_cnProcBufRes
;
// Number of CNs in Groups
uint32_t
M
;
...
...
@@ -56,11 +49,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Offset to each bit within a group in terms of 32 Byte
uint32_t
bitOffsetInGroup
;
__m256i
ymm0
,
min
,
sgn
;
__m256i
*
p_cnProcBufResBit
;
__m512i
zmm0
,
min
,
sgn
,
zeros
;
zeros
=
_mm512_setzero_si512
();
// maxLLR = _mm512_set1_epi8((char)127);
__m512i
*
p_cnProcBufResBit
;
const
__m
256i
*
p_ones
=
(
__m256i
*
)
ones256
_epi8
;
const
__m
256i
*
p_maxLLR
=
(
__m256i
*
)
maxLLR256
_epi8
;
const
__m
512i
*
p_ones
=
(
__m512i
*
)
ones512
_epi8
;
const
__m
512i
*
p_maxLLR
=
(
__m512i
*
)
maxLLR512
_epi8
;
// LUT with offsets for bits that need to be processed
// 1. bit proc requires LLRs of 2. and 3. bit, 2.bits of 1. and 3. etc.
...
...
@@ -74,13 +69,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
0
]
*
Z
+
31
)
>>
5
;
M
=
(
lut_numCnInCnGroups
[
0
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
0
]
*
NR_LDPC_ZMAX
)
>>
5
;
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
0
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 3
p_cnProcBuf
=
(
__m
256
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
0
]];
p_cnProcBufRes
=
(
__m
256
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
0
]];
p_cnProcBuf
=
(
__m
512
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
0
]];
p_cnProcBufRes
=
(
__m
512
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
0
]];
// Loop over every BN
for
(
j
=
0
;
j
<
3
;
j
++
)
...
...
@@ -88,29 +83,29 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
__m
256i
*
pj0
=
&
p_cnProcBuf
[
lut_idxCnProcG3
[
j
][
0
]
];
__m
256i
*
pj1
=
&
p_cnProcBuf
[
lut_idxCnProcG3
[
j
][
1
]
];
__m
512i
*
pj0
=
&
p_cnProcBuf
[(
lut_idxCnProcG3
[
j
][
0
]
/
2
)
];
__m
512i
*
pj1
=
&
p_cnProcBuf
[(
lut_idxCnProcG3
[
j
][
1
]
/
2
)
];
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
// y
mm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
y
mm0
=
pj0
[
i
];
sgn
=
_mm
256_sign_epi8
(
*
p_ones
,
y
mm0
);
min
=
_mm
256_abs_epi8
(
y
mm0
);
// z
mm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
z
mm0
=
pj0
[
i
];
sgn
=
_mm
512_xor_si512
(
*
p_ones
,
z
mm0
);
min
=
_mm
512_abs_epi8
(
z
mm0
);
// 32 CNs of second BN
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1]
+ i];
y
mm0
=
pj1
[
i
];
min
=
_mm
256_min_epu8
(
min
,
_mm256_abs_epi8
(
y
mm0
));
sgn
=
_mm
256_sign_epi8
(
sgn
,
y
mm0
);
// zmm0 = p_cnProcBuf[(lut_idxCnProcG3[j][1]/2)
+ i];
z
mm0
=
pj1
[
i
];
min
=
_mm
512_min_epu8
(
min
,
_mm512_abs_epi8
(
z
mm0
));
sgn
=
_mm
512_xor_si512
(
sgn
,
z
mm0
);
// Store result
min
=
_mm
256
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
min
=
_mm
512
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
//p_cnProcBufResBit[i]=_mm256
_sign_epi8(min, sgn);
//p_cnProcBufResBit[i]=_mm512
_sign_epi8(min, sgn);
}
}
}
...
...
@@ -125,13 +120,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
1
]
*
Z
+
31
)
>>
5
;
M
=
(
lut_numCnInCnGroups
[
1
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
1
]
*
NR_LDPC_ZMAX
)
>>
5
;
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
1
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 4
p_cnProcBuf
=
(
__m
256
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
1
]];
p_cnProcBufRes
=
(
__m
256
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
1
]];
p_cnProcBuf
=
(
__m
512
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
1
]];
p_cnProcBufRes
=
(
__m
512
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
1
]];
// Loop over every BN
for
(
j
=
0
;
j
<
4
;
j
++
)
...
...
@@ -143,21 +138,21 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG4
[
j
][
0
]
+
i
];
sgn
=
_mm
256_sign_epi8
(
*
p_ones
,
y
mm0
);
min
=
_mm
256_abs_epi8
(
y
mm0
);
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG4
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm
512_xor_si512
(
*
p_ones
,
z
mm0
);
min
=
_mm
512_abs_epi8
(
z
mm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
3
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG4
[
j
][
k
]
+
i
];
min
=
_mm
256_min_epu8
(
min
,
_mm256_abs_epi8
(
y
mm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
y
mm0
);
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG4
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm
512_min_epu8
(
min
,
_mm512_abs_epi8
(
z
mm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
z
mm0
);
}
// Store result
min
=
_mm
256
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
min
=
_mm
512
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
}
}
...
...
@@ -174,13 +169,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
2
]
*
Z
+
31
)
>>
5
;
M
=
(
lut_numCnInCnGroups
[
2
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
2
]
*
NR_LDPC_ZMAX
)
>>
5
;
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
2
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 5
p_cnProcBuf
=
(
__m
256
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
2
]];
p_cnProcBufRes
=
(
__m
256
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
2
]];
p_cnProcBuf
=
(
__m
512
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
2
]];
p_cnProcBufRes
=
(
__m
512
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
2
]];
// Loop over every BN
for
(
j
=
0
;
j
<
5
;
j
++
)
...
...
@@ -192,21 +187,21 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG5
[
j
][
0
]
+
i
];
sgn
=
_mm
256_sign_epi8
(
*
p_ones
,
y
mm0
);
min
=
_mm
256_abs_epi8
(
y
mm0
);
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG5
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm
512_xor_si512
(
*
p_ones
,
z
mm0
);
min
=
_mm
512_abs_epi8
(
z
mm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
4
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG5
[
j
][
k
]
+
i
];
min
=
_mm
256_min_epu8
(
min
,
_mm256_abs_epi8
(
y
mm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
y
mm0
);
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG5
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm
512_min_epu8
(
min
,
_mm512_abs_epi8
(
z
mm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
z
mm0
);
}
// Store result
min
=
_mm
256
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
min
=
_mm
512
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
}
}
...
...
@@ -224,13 +219,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
3
]
*
Z
+
31
)
>>
5
;
M
=
(
lut_numCnInCnGroups
[
3
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
3
]
*
NR_LDPC_ZMAX
)
>>
5
;
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
3
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 6
p_cnProcBuf
=
(
__m
256
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
3
]];
p_cnProcBufRes
=
(
__m
256
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
3
]];
p_cnProcBuf
=
(
__m
512
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
3
]];
p_cnProcBufRes
=
(
__m
512
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
3
]];
// Loop over every BN
for
(
j
=
0
;
j
<
6
;
j
++
)
...
...
@@ -242,21 +237,21 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG6
[
j
][
0
]
+
i
];
sgn
=
_mm
256_sign_epi8
(
*
p_ones
,
y
mm0
);
min
=
_mm
256_abs_epi8
(
y
mm0
);
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG6
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm
512_xor_si512
(
*
p_ones
,
z
mm0
);
min
=
_mm
512_abs_epi8
(
z
mm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
5
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG6
[
j
][
k
]
+
i
];
min
=
_mm
256_min_epu8
(
min
,
_mm256_abs_epi8
(
y
mm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
y
mm0
);
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG6
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm
512_min_epu8
(
min
,
_mm512_abs_epi8
(
z
mm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
z
mm0
);
}
// Store result
min
=
_mm
256
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
min
=
_mm
512
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
}
}
...
...
@@ -275,13 +270,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
4
]
*
Z
+
31
)
>>
5
;
M
=
(
lut_numCnInCnGroups
[
4
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
4
]
*
NR_LDPC_ZMAX
)
>>
5
;
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
4
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 8
p_cnProcBuf
=
(
__m
256
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
4
]];
p_cnProcBufRes
=
(
__m
256
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
4
]];
p_cnProcBuf
=
(
__m
512
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
4
]];
p_cnProcBufRes
=
(
__m
512
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
4
]];
// Loop over every BN
for
(
j
=
0
;
j
<
8
;
j
++
)
...
...
@@ -293,21 +288,21 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG8
[
j
][
0
]
+
i
];
sgn
=
_mm
256_sign_epi8
(
*
p_ones
,
y
mm0
);
min
=
_mm
256_abs_epi8
(
y
mm0
);
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG8
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm
512_xor_si512
(
*
p_ones
,
z
mm0
);
min
=
_mm
512_abs_epi8
(
z
mm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
7
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG8
[
j
][
k
]
+
i
];
min
=
_mm
256_min_epu8
(
min
,
_mm256_abs_epi8
(
y
mm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
y
mm0
);
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG8
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm
512_min_epu8
(
min
,
_mm512_abs_epi8
(
z
mm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
z
mm0
);
}
// Store result
min
=
_mm
256
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
min
=
_mm
512
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
}
}
...
...
@@ -327,13 +322,13 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
5
]
*
Z
+
31
)
>>
5
;
M
=
(
lut_numCnInCnGroups
[
5
]
*
Z
+
63
)
>>
6
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
5
]
*
NR_LDPC_ZMAX
)
>>
5
;
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG2_R15
[
5
]
*
NR_LDPC_ZMAX
)
>>
6
;
// Set pointers to start of group 10
p_cnProcBuf
=
(
__m
256
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
5
]];
p_cnProcBufRes
=
(
__m
256
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
5
]];
p_cnProcBuf
=
(
__m
512
i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
5
]];
p_cnProcBufRes
=
(
__m
512
i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
5
]];
// Loop over every BN
for
(
j
=
0
;
j
<
10
;
j
++
)
...
...
@@ -345,21 +340,21 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG10
[
j
][
0
]
+
i
];
sgn
=
_mm
256_sign_epi8
(
*
p_ones
,
y
mm0
);
min
=
_mm
256_abs_epi8
(
y
mm0
);
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG10
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm
512_xor_si512
(
*
p_ones
,
z
mm0
);
min
=
_mm
512_abs_epi8
(
z
mm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
9
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG10
[
j
][
k
]
+
i
];
min
=
_mm
256_min_epu8
(
min
,
_mm256_abs_epi8
(
y
mm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
y
mm0
);
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG10
[
j
][
k
]
/
2
)
+
i
];
min
=
_mm
512_min_epu8
(
min
,
_mm512_abs_epi8
(
z
mm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
z
mm0
);
}
// Store result
min
=
_mm
256
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
min
=
_mm
512
_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
conditional_negate
(
min
,
sgn
,
zeros
);
p_cnProcBufResBit
++
;
}
}
...
...
@@ -367,15 +362,6 @@ static inline void nrLDPC_cnProc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_pr
}
/**
\brief Performs CN processing for BG1 on the CN processing buffer and stores the results in the CN processing results buffer.
\param p_lut Pointer to decoder LUTs
\param Z Lifting size
*/
#ifdef __AVX512BW__
#define conditional_negate(a,b,z) _mm512_mask_sub_epi8(a,_mm512_movepi8_mask(b),z,a)
static
inline
void
nrLDPC_cnProc_BG1_AVX512
(
t_nrLDPC_lut
*
p_lut
,
t_nrLDPC_procBuf
*
p_procBuf
,
uint16_t
Z
)
{
const
uint8_t
*
lut_numCnInCnGroups
=
p_lut
->
numCnInCnGroups
;
...
...
@@ -383,7 +369,7 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
int8_t
*
cnProcBuf
=
p_procBuf
->
cnProcBuf
;
int8_t
*
cnProcBufRes
=
p_procBuf
->
cnProcBufRes
;
__m512i
*
p_cnProcBuf
;
__m512i
*
p_cnProcBufRes
;
...
...
@@ -395,16 +381,16 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
// Offset to each bit within a group in terms of 32 Byte
uint32_t
bitOffsetInGroup
;
__m512i
zmm0
,
min
,
sgn
,
zeros
,
maxLLR
;
__m512i
zmm0
,
min
,
sgn
,
zeros
;
zeros
=
_mm512_setzero_si512
();
maxLLR
=
_mm512_set1_epi8
((
char
)
127
);
//
maxLLR = _mm512_set1_epi8((char)127);
__m512i
*
p_cnProcBufResBit
;
const
__m512i
*
p_ones
=
(
__m512i
*
)
ones512_epi8
;
const
__m512i
*
p_maxLLR
=
(
__m512i
*
)
maxLLR512_epi8
;
...
...
@@ -440,14 +426,14 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
{
// Abs and sign of 32 CNs (first BN)
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG3
[
j
][
0
]
/
2
)
+
i
];
sgn
=
_mm512_xor_si512
(
*
p_ones
,
zmm0
);
sgn
=
_mm512_xor_si512
(
*
p_ones
,
zmm0
);
min
=
_mm512_abs_epi8
(
zmm0
);
// 32 CNs of second BN
zmm0
=
p_cnProcBuf
[(
lut_idxCnProcG3
[
j
][
1
]
/
2
)
+
i
];
min
=
_mm512_min_epu8
(
min
,
_mm512_abs_epi8
(
zmm0
));
sgn
=
_mm512_xor_si512
(
sgn
,
zmm0
);
// Store result
min
=
_mm512_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
...
...
@@ -876,1569 +862,4 @@ static inline void nrLDPC_cnProc_BG1_AVX512(t_nrLDPC_lut* p_lut, t_nrLDPC_procBu
}
}
#else
static
inline
void
nrLDPC_cnProc_BG1
(
t_nrLDPC_lut
*
p_lut
,
t_nrLDPC_procBuf
*
p_procBuf
,
uint16_t
Z
)
{
const
uint8_t
*
lut_numCnInCnGroups
=
p_lut
->
numCnInCnGroups
;
const
uint32_t
*
lut_startAddrCnGroups
=
p_lut
->
startAddrCnGroups
;
int8_t
*
cnProcBuf
=
p_procBuf
->
cnProcBuf
;
int8_t
*
cnProcBufRes
=
p_procBuf
->
cnProcBufRes
;
__m256i
*
p_cnProcBuf
;
__m256i
*
p_cnProcBufRes
;
// Number of CNs in Groups
uint32_t
M
;
uint32_t
i
;
uint32_t
j
;
uint32_t
k
;
// Offset to each bit within a group in terms of 32 Byte
uint32_t
bitOffsetInGroup
;
__m256i
ymm0
,
min
,
sgn
;
__m256i
*
p_cnProcBufResBit
;
const
__m256i
*
p_ones
=
(
__m256i
*
)
ones256_epi8
;
const
__m256i
*
p_maxLLR
=
(
__m256i
*
)
maxLLR256_epi8
;
// LUT with offsets for bits that need to be processed
// 1. bit proc requires LLRs of 2. and 3. bit, 2.bits of 1. and 3. etc.
// Offsets are in units of bitOffsetInGroup (1*384/32)
const
uint8_t
lut_idxCnProcG3
[
3
][
2
]
=
{{
12
,
24
},
{
0
,
24
},
{
0
,
12
}};
// =====================================================================
// Process group with 3 BNs
if
(
lut_numCnInCnGroups
[
0
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
0
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG1_R13
[
0
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 3
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
0
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
0
]];
// Loop over every BN
for
(
j
=
0
;
j
<
3
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG3
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// 32 CNs of second BN
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG3
[
j
][
1
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 4 BNs
// Offset is 5*384/32 = 60
const
uint8_t
lut_idxCnProcG4
[
4
][
3
]
=
{{
60
,
120
,
180
},
{
0
,
120
,
180
},
{
0
,
60
,
180
},
{
0
,
60
,
120
}};
if
(
lut_numCnInCnGroups
[
1
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
1
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG1_R13
[
1
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 4
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
1
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
1
]];
// Loop over every BN
for
(
j
=
0
;
j
<
4
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG4
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
3
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG4
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 5 BNs
// Offset is 18*384/32 = 216
const
uint16_t
lut_idxCnProcG5
[
5
][
4
]
=
{{
216
,
432
,
648
,
864
},
{
0
,
432
,
648
,
864
},
{
0
,
216
,
648
,
864
},
{
0
,
216
,
432
,
864
},
{
0
,
216
,
432
,
648
}};
if
(
lut_numCnInCnGroups
[
2
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
2
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG1_R13
[
2
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 5
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
2
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
2
]];
// Loop over every BN
for
(
j
=
0
;
j
<
5
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG5
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
4
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG5
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 6 BNs
// Offset is 8*384/32 = 96
const
uint16_t
lut_idxCnProcG6
[
6
][
5
]
=
{{
96
,
192
,
288
,
384
,
480
},
{
0
,
192
,
288
,
384
,
480
},
{
0
,
96
,
288
,
384
,
480
},
{
0
,
96
,
192
,
384
,
480
},
{
0
,
96
,
192
,
288
,
480
},
{
0
,
96
,
192
,
288
,
384
}};
if
(
lut_numCnInCnGroups
[
3
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
3
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG1_R13
[
3
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 6
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
3
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
3
]];
// Loop over every BN
for
(
j
=
0
;
j
<
6
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG6
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
5
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG6
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 7 BNs
// Offset is 5*384/32 = 60
const
uint16_t
lut_idxCnProcG7
[
7
][
6
]
=
{{
60
,
120
,
180
,
240
,
300
,
360
},
{
0
,
120
,
180
,
240
,
300
,
360
},
{
0
,
60
,
180
,
240
,
300
,
360
},
{
0
,
60
,
120
,
240
,
300
,
360
},
{
0
,
60
,
120
,
180
,
300
,
360
},
{
0
,
60
,
120
,
180
,
240
,
360
},
{
0
,
60
,
120
,
180
,
240
,
300
}};
if
(
lut_numCnInCnGroups
[
4
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
4
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG1_R13
[
4
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 7
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
4
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
4
]];
// Loop over every BN
for
(
j
=
0
;
j
<
7
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG7
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
6
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG7
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 8 BNs
// Offset is 2*384/32 = 24
const
uint8_t
lut_idxCnProcG8
[
8
][
7
]
=
{{
24
,
48
,
72
,
96
,
120
,
144
,
168
},
{
0
,
48
,
72
,
96
,
120
,
144
,
168
},
{
0
,
24
,
72
,
96
,
120
,
144
,
168
},
{
0
,
24
,
48
,
96
,
120
,
144
,
168
},
{
0
,
24
,
48
,
72
,
120
,
144
,
168
},
{
0
,
24
,
48
,
72
,
96
,
144
,
168
},
{
0
,
24
,
48
,
72
,
96
,
120
,
168
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
}};
if
(
lut_numCnInCnGroups
[
5
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
5
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG1_R13
[
5
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 8
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
5
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
5
]];
// Loop over every BN
for
(
j
=
0
;
j
<
8
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG8
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
7
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG8
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 9 BNs
// Offset is 2*384/32 = 24
const
uint8_t
lut_idxCnProcG9
[
9
][
8
]
=
{{
24
,
48
,
72
,
96
,
120
,
144
,
168
,
192
},
{
0
,
48
,
72
,
96
,
120
,
144
,
168
,
192
},
{
0
,
24
,
72
,
96
,
120
,
144
,
168
,
192
},
{
0
,
24
,
48
,
96
,
120
,
144
,
168
,
192
},
{
0
,
24
,
48
,
72
,
120
,
144
,
168
,
192
},
{
0
,
24
,
48
,
72
,
96
,
144
,
168
,
192
},
{
0
,
24
,
48
,
72
,
96
,
120
,
168
,
192
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
,
192
},
{
0
,
24
,
48
,
72
,
96
,
120
,
144
,
168
}};
if
(
lut_numCnInCnGroups
[
6
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
6
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG1_R13
[
6
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 9
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
6
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
6
]];
// Loop over every BN
for
(
j
=
0
;
j
<
9
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG9
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
8
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG9
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 10 BNs
// Offset is 1*384/32 = 12
const
uint8_t
lut_idxCnProcG10
[
10
][
9
]
=
{{
12
,
24
,
36
,
48
,
60
,
72
,
84
,
96
,
108
},
{
0
,
24
,
36
,
48
,
60
,
72
,
84
,
96
,
108
},
{
0
,
12
,
36
,
48
,
60
,
72
,
84
,
96
,
108
},
{
0
,
12
,
24
,
48
,
60
,
72
,
84
,
96
,
108
},
{
0
,
12
,
24
,
36
,
60
,
72
,
84
,
96
,
108
},
{
0
,
12
,
24
,
36
,
48
,
72
,
84
,
96
,
108
},
{
0
,
12
,
24
,
36
,
48
,
60
,
84
,
96
,
108
},
{
0
,
12
,
24
,
36
,
48
,
60
,
72
,
96
,
108
},
{
0
,
12
,
24
,
36
,
48
,
60
,
72
,
84
,
108
},
{
0
,
12
,
24
,
36
,
48
,
60
,
72
,
84
,
96
}};
if
(
lut_numCnInCnGroups
[
7
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
7
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG1_R13
[
7
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 10
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
7
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
7
]];
// Loop over every BN
for
(
j
=
0
;
j
<
10
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG10
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
9
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG10
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
// =====================================================================
// Process group with 19 BNs
// Offset is 4*384/32 = 12
const
uint16_t
lut_idxCnProcG19
[
19
][
18
]
=
{{
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
528
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
576
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
624
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
672
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
720
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
768
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
816
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
864
},
{
0
,
48
,
96
,
144
,
192
,
240
,
288
,
336
,
384
,
432
,
480
,
528
,
576
,
624
,
672
,
720
,
768
,
816
}};
if
(
lut_numCnInCnGroups
[
8
]
>
0
)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M
=
(
lut_numCnInCnGroups
[
8
]
*
Z
+
31
)
>>
5
;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup
=
(
lut_numCnInCnGroups_BG1_R13
[
8
]
*
NR_LDPC_ZMAX
)
>>
5
;
// Set pointers to start of group 19
p_cnProcBuf
=
(
__m256i
*
)
&
cnProcBuf
[
lut_startAddrCnGroups
[
8
]];
p_cnProcBufRes
=
(
__m256i
*
)
&
cnProcBufRes
[
lut_startAddrCnGroups
[
8
]];
// Loop over every BN
for
(
j
=
0
;
j
<
19
;
j
++
)
{
// Set of results pointer to correct BN address
p_cnProcBufResBit
=
p_cnProcBufRes
+
(
j
*
bitOffsetInGroup
);
// Loop over CNs
for
(
i
=
0
;
i
<
M
;
i
++
)
{
// Abs and sign of 32 CNs (first BN)
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG19
[
j
][
0
]
+
i
];
sgn
=
_mm256_sign_epi8
(
*
p_ones
,
ymm0
);
min
=
_mm256_abs_epi8
(
ymm0
);
// Loop over BNs
for
(
k
=
1
;
k
<
18
;
k
++
)
{
ymm0
=
p_cnProcBuf
[
lut_idxCnProcG19
[
j
][
k
]
+
i
];
min
=
_mm256_min_epu8
(
min
,
_mm256_abs_epi8
(
ymm0
));
sgn
=
_mm256_sign_epi8
(
sgn
,
ymm0
);
}
// Store result
min
=
_mm256_min_epu8
(
min
,
*
p_maxLLR
);
// 128 in epi8 is -127
*
p_cnProcBufResBit
=
_mm256_sign_epi8
(
min
,
sgn
);
p_cnProcBufResBit
++
;
}
}
}
}
#endif
/**
   \brief Parity check of a single BG1 CN group, processing 32 CNs per AVX2 vector.
          For every chunk of 32 CNs the entries of both buffers are added with
          saturation for each connected BN, the sign bits are extracted and
          XOR-ed over all BNs (addition in GF(2)). The per-chunk results are
          OR-ed together.
   \param p_cnProcBuf Start of the group in the CN processing buffer
   \param p_cnProcBufRes Start of the group in the CN processing results buffer
   \param numBn Number of BNs connected to each CN of the group
   \param bnOffset Distance between two consecutive BNs in units of 32 bytes
   \param M Number of CNs in the group
   \return 0 if all M parity checks are satisfied, otherwise a mask with one bit per failing vector lane
   NOTE(review): both pointers are dereferenced as __m256i, so the group start
   addresses are presumably 32-byte aligned -- confirm against the buffer allocation.
*/
static inline uint32_t nrLDPC_cnProcPc_BG1_group(const __m256i* p_cnProcBuf,
                                                 const __m256i* p_cnProcBufRes,
                                                 uint32_t numBn,
                                                 uint32_t bnOffset,
                                                 uint32_t M)
{
  uint32_t i;
  uint32_t j;
  uint32_t pcRes;
  uint32_t pcResSum = 0;
  // Remainder modulo 32
  const uint32_t Mrem = M & 31;
  // Number of groups of 32 CNs for parallel processing
  // Ceil for values not divisible by 32
  const uint32_t M32 = (M + 31) >> 5;
  // Valid lanes of the last chunk. When M is a multiple of 32 every lane is
  // valid; shifting a 32-bit value by 32 would be undefined behaviour, so
  // that case is handled explicitly.
  const uint32_t lastMask = (Mrem == 0) ? 0xFFFFFFFFu : (0xFFFFFFFFu >> (32 - Mrem));
  __m256i ymm0, ymm1;

  // Loop over CNs, 32 at once. Nothing is read when M == 0.
  for (i = 0; i < M32; i++) {
    pcRes = 0;

    // Loop over every BN
    for (j = 0; j < numBn; j++) {
      ymm0 = p_cnProcBuf[j * bnOffset + i];
      ymm1 = p_cnProcBufRes[j * bnOffset + i];

      // Add BN and input LLR, extract the sign bit
      // and add in GF(2) (xor)
      pcRes ^= (uint32_t)_mm256_movemask_epi8(_mm256_adds_epi8(ymm0, ymm1));
    }

    // Last 32 CNs might not be full valid 32 depending on Z:
    // only use valid CNs
    if (i == (M32 - 1)) {
      pcRes &= lastMask;
    }

    // If no error pcRes should be 0
    pcResSum |= pcRes;
  }

  return pcResSum;
}

/**
   \brief Performs parity check for BG1 on the CN processing buffer.
          The CN groups are checked in order; the function returns after the
          first group that contains a failing check, later groups are not evaluated.
   \param p_lut Pointer to decoder LUTs
   \param p_procBuf Pointer to decoder processing buffers (cnProcBuf and cnProcBufRes are read)
   \param Z Lifting size
   \return 32-bit parity check indicator: 0 if all checks pass, non-zero otherwise
*/
static inline uint32_t nrLDPC_cnProcPc_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
  // Number of BNs connected to a CN for each of the 9 CN groups of BG1
  static const uint8_t numBnInGroup[9] = {3, 4, 5, 6, 7, 8, 9, 10, 19};
  // BN offset of each group in units of 32 bytes:
  // {1, 5, 18, 8, 5, 2, 2, 1, 4} * 384/32
  static const uint16_t bnOffsetInGroup[9] = {12, 60, 216, 96, 60, 24, 24, 12, 48};

  const uint8_t* lut_numCnInCnGroups = p_lut->numCnInCnGroups;
  const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;

  int8_t* cnProcBuf = p_procBuf->cnProcBuf;
  int8_t* cnProcBufRes = p_procBuf->cnProcBufRes;

  uint32_t g;
  uint32_t pcResSum = 0;

  for (g = 0; g < 9; g++) {
    // Skip groups that are not present
    if (lut_numCnInCnGroups[g] == 0) {
      continue;
    }

    pcResSum = nrLDPC_cnProcPc_BG1_group((const __m256i*)&cnProcBuf[lut_startAddrCnGroups[g]],
                                         (const __m256i*)&cnProcBufRes[lut_startAddrCnGroups[g]],
                                         numBnInGroup[g],
                                         bnOffsetInGroup[g],
                                         (uint32_t)lut_numCnInCnGroups[g] * Z);

    // If PC failed we can stop here
    if (pcResSum > 0) {
      return pcResSum;
    }
  }

  return pcResSum;
}
/**
   \brief Parity check of a single BG2 CN group, processing 32 CNs per AVX2 vector.
          For every chunk of 32 CNs the entries of both buffers are added with
          saturation for each connected BN, the sign bits are extracted and
          XOR-ed over all BNs (addition in GF(2)). The per-chunk results are
          OR-ed together.
   \param p_cnProcBuf Start of the group in the CN processing buffer
   \param p_cnProcBufRes Start of the group in the CN processing results buffer
   \param numBn Number of BNs connected to each CN of the group
   \param bnOffset Distance between two consecutive BNs in units of 32 bytes
   \param M Number of CNs in the group
   \return 0 if all M parity checks are satisfied, otherwise a mask with one bit per failing vector lane
   NOTE(review): both pointers are dereferenced as __m256i, so the group start
   addresses are presumably 32-byte aligned -- confirm against the buffer allocation.
*/
static inline uint32_t nrLDPC_cnProcPc_BG2_group(const __m256i* p_cnProcBuf,
                                                 const __m256i* p_cnProcBufRes,
                                                 uint32_t numBn,
                                                 uint32_t bnOffset,
                                                 uint32_t M)
{
  uint32_t i;
  uint32_t j;
  uint32_t pcRes;
  uint32_t pcResSum = 0;
  // Remainder modulo 32
  const uint32_t Mrem = M & 31;
  // Number of groups of 32 CNs for parallel processing
  // Ceil for values not divisible by 32
  const uint32_t M32 = (M + 31) >> 5;
  // Valid lanes of the last chunk. When M is a multiple of 32 every lane is
  // valid; shifting a 32-bit value by 32 would be undefined behaviour, so
  // that case is handled explicitly.
  const uint32_t lastMask = (Mrem == 0) ? 0xFFFFFFFFu : (0xFFFFFFFFu >> (32 - Mrem));
  __m256i ymm0, ymm1;

  // Loop over CNs, 32 at once. Nothing is read when M == 0.
  for (i = 0; i < M32; i++) {
    pcRes = 0;

    // Loop over every BN
    for (j = 0; j < numBn; j++) {
      ymm0 = p_cnProcBuf[j * bnOffset + i];
      ymm1 = p_cnProcBufRes[j * bnOffset + i];

      // Add BN and input LLR, extract the sign bit
      // and add in GF(2) (xor)
      pcRes ^= (uint32_t)_mm256_movemask_epi8(_mm256_adds_epi8(ymm0, ymm1));
    }

    // Last 32 CNs might not be full valid 32 depending on Z:
    // only use valid CNs
    if (i == (M32 - 1)) {
      pcRes &= lastMask;
    }

    // If no error pcRes should be 0
    pcResSum |= pcRes;
  }

  return pcResSum;
}

/**
   \brief Performs parity check for BG2 on the CN processing buffer.
          The CN groups are checked in order; the function returns after the
          first group that contains a failing check, later groups are not evaluated.
   \param p_lut Pointer to decoder LUTs
   \param p_procBuf Pointer to decoder processing buffers (cnProcBuf and cnProcBufRes are read)
   \param Z Lifting size
   \return 32-bit parity check indicator: 0 if all checks pass, non-zero otherwise
*/
static inline uint32_t nrLDPC_cnProcPc_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
  // Number of BNs connected to a CN for each of the 6 CN groups of BG2
  static const uint8_t numBnInGroup[6] = {3, 4, 5, 6, 8, 10};
  // BN offset of each group in units of 32 bytes:
  // {6, 20, 9, 3, 2, 2} * 384/32
  static const uint16_t bnOffsetInGroup[6] = {72, 240, 108, 36, 24, 24};

  const uint8_t* lut_numCnInCnGroups = p_lut->numCnInCnGroups;
  const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;

  int8_t* cnProcBuf = p_procBuf->cnProcBuf;
  int8_t* cnProcBufRes = p_procBuf->cnProcBufRes;

  uint32_t g;
  uint32_t pcResSum = 0;

  for (g = 0; g < 6; g++) {
    // Skip groups that are not present
    if (lut_numCnInCnGroups[g] == 0) {
      continue;
    }

    pcResSum = nrLDPC_cnProcPc_BG2_group((const __m256i*)&cnProcBuf[lut_startAddrCnGroups[g]],
                                         (const __m256i*)&cnProcBufRes[lut_startAddrCnGroups[g]],
                                         numBnInGroup[g],
                                         bnOffsetInGroup[g],
                                         (uint32_t)lut_numCnInCnGroups[g] * Z);

    // If PC failed we can stop here
    if (pcResSum > 0) {
      return pcResSum;
    }
  }

  return pcResSum;
}
#endif
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c
View file @
f6bb869c
...
...
@@ -33,8 +33,10 @@
#include "nrLDPC_mPass.h"
#include "nrLDPC_cnProc.h"
#include "nrLDPC_bnProc.h"
#define UNROLL_CN_PROC 1
#define UNROLL_BN_PROC 1
#define UNROLL_BN_PROC_PC 1
#define UNROLL_BN2CN_PROC 1
/*----------------------------------------------------------------------
| cn processing files -->AVX512
/----------------------------------------------------------------------*/
...
...
@@ -197,14 +199,8 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
llr2CnProcBuf
);
#endif
if
(
BG
==
1
)
{
nrLDPC_llr2CnProcBuf_BG1
(
p_lut
,
p_llr
,
cnProcBuf
,
Z
);
}
else
{
nrLDPC_llr2CnProcBuf_BG2
(
p_lut
,
p_llr
,
cnProcBuf
,
Z
);
}
if
(
BG
==
1
)
nrLDPC_llr2CnProcBuf_BG1
(
p_lut
,
p_llr
,
cnProcBuf
,
Z
);
else
nrLDPC_llr2CnProcBuf_BG2
(
p_lut
,
p_llr
,
cnProcBuf
,
Z
);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
llr2CnProcBuf
);
#endif
...
...
@@ -220,8 +216,10 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
cnProc
);
#endif
if
(
BG
==
1
)
{
if
(
BG
==
1
)
{
#ifndef UNROLL_CN_PROC
nrLDPC_cnProc_BG1
(
p_lut
,
cnProcBuf
,
cnProcBufRes
,
Z
);
#else
switch
(
R
)
{
case
13
:
...
...
@@ -232,7 +230,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
nrLDPC_cnProc_BG1_R13_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
case
23
:
{
...
...
@@ -242,7 +240,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
nrLDPC_cnProc_BG1_R23_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
case
89
:
{
...
...
@@ -252,14 +250,15 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
nrLDPC_cnProc_BG1_R89_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
}
}
else
{
switch
(
R
)
{
#endif
}
else
{
#ifndef UNROLL_CN_PROC
nrLDPC_cnProc_BG2
(
p_lut
,
cnProcBuf
,
cnProcBufRes
,
Z
);
#else
switch
(
R
)
{
case
15
:
{
#ifdef __AVX512BW__
...
...
@@ -268,8 +267,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
nrLDPC_cnProc_BG2_R15_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
case
13
:
{
#ifdef __AVX512BW__
...
...
@@ -278,8 +276,7 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
nrLDPC_cnProc_BG2_R13_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
case
23
:
{
#ifdef __AVX512BW__
...
...
@@ -288,10 +285,11 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
nrLDPC_cnProc_BG2_R23_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
}
}
#endif
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
cnProc
);
#endif
...
...
@@ -304,14 +302,8 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
cn2bnProcBuf
);
#endif
if
(
BG
==
1
)
{
nrLDPC_cn2bnProcBuf_BG1
(
p_lut
,
cnProcBufRes
,
bnProcBuf
,
Z
);
}
else
{
nrLDPC_cn2bnProcBuf_BG2
(
p_lut
,
cnProcBufRes
,
bnProcBuf
,
Z
);
}
if
(
BG
==
1
)
nrLDPC_cn2bnProcBuf_BG1
(
p_lut
,
cnProcBufRes
,
bnProcBuf
,
Z
);
else
nrLDPC_cn2bnProcBuf_BG2
(
p_lut
,
cnProcBufRes
,
bnProcBuf
,
Z
);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
cn2bnProcBuf
);
#endif
...
...
@@ -326,12 +318,12 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
start_meas
(
&
p_profiler
->
bnProcPc
);
#endif
//nrLDPC_bnProcPc(p_lut, p_procBuf, Z);
if
(
BG
==
1
)
{
switch
(
R
)
{
#ifndef UNROLL_BN_PROC_PC
nrLDPC_bnProcPc
(
p_lut
,
bnProcBuf
,
bnProcBufRes
,
llrProcBuf
,
llrRes
,
Z
);
#else
if
(
BG
==
1
)
{
switch
(
R
)
{
case
13
:
{
nrLDPC_bnProcPc_BG1_R13_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
...
...
@@ -348,34 +340,27 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
break
;
}
}
}
else
{
switch
(
R
)
{
}
else
{
switch
(
R
)
{
case
15
:
{
nrLDPC_bnProcPc_BG2_R15_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
break
;
}
case
13
:
{
nrLDPC_bnProcPc_BG2_R13_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
break
;
}
case
23
:
{
nrLDPC_bnProcPc_BG2_R23_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
break
;
}
}
}
}
#endif
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
bnProcPc
);
...
...
@@ -389,13 +374,12 @@ static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, uint32_
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
bnProc
);
#endif
// nrLDPC_bnProc(p_lut, p_procBuf, Z);
if
(
BG
==
1
)
{
switch
(
R
)
{
if
(
BG
==
1
)
{
#ifndef UNROLL_BN_PROC
nrLDPC_bnProc
(
p_lut
,
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#else
switch
(
R
)
{
case
13
:
{
#ifdef __AVX512BW__
...
...
@@ -424,11 +408,12 @@ if (BG==1)
break
;
}
}
}
else
{
switch
(
R
)
{
#endif
}
else
{
#ifndef UNROLL_BN2CN_PROC
nrLDPC_bn2cnProcBuf_BG2
(
p_lut
,
bnProcBufRes
,
cnProcBuf
,
Z
);
#else
switch
(
R
)
{
case
15
:
{
#ifdef __AVX512BW__
...
...
@@ -439,7 +424,6 @@ if (BG==1)
break
;
}
case
13
:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R13_AVX512
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
...
...
@@ -450,7 +434,6 @@ if (BG==1)
}
case
23
:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R23_AVX512
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
...
...
@@ -459,9 +442,8 @@ if (BG==1)
#endif
break
;
}
}
#endif
}
#ifdef NR_LDPC_PROFILER_DETAIL
...
...
@@ -477,14 +459,8 @@ if (BG==1)
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
bn2cnProcBuf
);
#endif
if
(
BG
==
1
)
{
nrLDPC_bn2cnProcBuf_BG1
(
p_lut
,
bnProcBufRes
,
cnProcBuf
,
Z
);
}
else
{
nrLDPC_bn2cnProcBuf_BG2
(
p_lut
,
bnProcBufRes
,
cnProcBuf
,
Z
);
}
if
(
BG
==
1
)
nrLDPC_bn2cnProcBuf_BG1
(
p_lut
,
bnProcBufRes
,
cnProcBuf
,
Z
);
else
nrLDPC_bn2cnProcBuf_BG2
(
p_lut
,
bnProcBufRes
,
cnProcBuf
,
Z
);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
bn2cnProcBuf
);
#endif
...
...
@@ -499,8 +475,7 @@ if (BG==1)
// First iteration finished
while
(
(
i
<
numMaxIter
)
&&
(
pcRes
!=
0
)
)
{
while
(
(
i
<
numMaxIter
)
&&
(
pcRes
!=
0
)
)
{
// Increase iteration counter
i
++
;
...
...
@@ -508,10 +483,11 @@ if (BG==1)
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
cnProc
);
#endif
if
(
BG
==
1
)
{
switch
(
R
)
{
if
(
BG
==
1
)
{
#ifndef UNROLL_CN_PROC
nrLDPC_cnProc_BG1
(
p_lut
,
cnProcBuf
,
cnProcBufRes
,
Z
);
#else
switch
(
R
)
{
case
13
:
{
#ifdef __AVX512BW__
...
...
@@ -520,8 +496,7 @@ if (BG==1)
nrLDPC_cnProc_BG1_R13_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
case
23
:
{
#ifdef __AVX512BW__
...
...
@@ -530,8 +505,7 @@ if (BG==1)
nrLDPC_cnProc_BG1_R23_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
case
89
:
{
#ifdef __AVX512BW__
...
...
@@ -540,14 +514,14 @@ if (BG==1)
nrLDPC_cnProc_BG1_R89_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
}
}
else
{
switch
(
R
)
{
#endif
}
else
{
#ifndef UNROLL_CN_PROC
nrLDPC_cnProc_BG2
(
p_lut
,
cnProcBuf
,
cnProcBufRes
,
Z
);
#else
switch
(
R
)
{
case
15
:
{
#ifdef __AVX512BW__
...
...
@@ -556,8 +530,7 @@ if (BG==1)
nrLDPC_cnProc_BG2_R15_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
case
13
:
{
#ifdef __AVX512BW__
...
...
@@ -566,8 +539,7 @@ if (BG==1)
nrLDPC_cnProc_BG2_R13_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
case
23
:
{
#ifdef __AVX512BW__
...
...
@@ -576,9 +548,10 @@ if (BG==1)
nrLDPC_cnProc_BG2_R23_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
}
#endif
}
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
cnProc
);
#endif
...
...
@@ -591,14 +564,8 @@ if (BG==1)
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
cn2bnProcBuf
);
#endif
if
(
BG
==
1
)
{
nrLDPC_cn2bnProcBuf_BG1
(
p_lut
,
cnProcBufRes
,
bnProcBuf
,
Z
);
}
else
{
nrLDPC_cn2bnProcBuf_BG2
(
p_lut
,
cnProcBufRes
,
bnProcBuf
,
Z
);
}
if
(
BG
==
1
)
nrLDPC_cn2bnProcBuf_BG1
(
p_lut
,
cnProcBufRes
,
bnProcBuf
,
Z
);
else
nrLDPC_cn2bnProcBuf_BG2
(
p_lut
,
cnProcBufRes
,
bnProcBuf
,
Z
);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
cn2bnProcBuf
);
#endif
...
...
@@ -612,11 +579,11 @@ if (BG==1)
start_meas
(
&
p_profiler
->
bnProcPc
);
#endif
//nrLDPC_bnProcPc(p_lut, p_procBuf, Z);
if
(
BG
==
1
)
{
switch
(
R
)
{
#ifndef UNROLL_BN_PROC_PC
nrLDPC_bnProcPc
(
p_lut
,
bnProcBuf
,
bnProcBufRes
,
llrProcBuf
,
llrRes
,
Z
);
#else
if
(
BG
==
1
)
{
switch
(
R
)
{
case
13
:
{
nrLDPC_bnProcPc_BG1_R13_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
...
...
@@ -632,36 +599,28 @@ if (BG==1)
nrLDPC_bnProcPc_BG1_R89_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
break
;
}
}
}
else
{
switch
(
R
)
{
}
}
else
{
switch
(
R
)
{
case
15
:
{
nrLDPC_bnProcPc_BG2_R15_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
break
;
}
case
13
:
case
13
:
{
nrLDPC_bnProcPc_BG2_R13_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
break
;
}
case
23
:
{
nrLDPC_bnProcPc_BG2_R23_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
break
;
}
}
}
}
#endif
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
bnProcPc
);
#endif
...
...
@@ -673,13 +632,11 @@ if (BG==1)
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
bnProc
);
#endif
// nrLDPC_bnProc(p_lut, p_procBuf, Z);
if
(
BG
==
1
)
{
switch
(
R
)
{
#ifndef UNROLL_BN_PROC
nrLDPC_bnProc
(
p_lut
,
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#else
if
(
BG
==
1
)
{
switch
(
R
)
{
case
13
:
{
#ifdef __AVX512BW__
...
...
@@ -707,12 +664,10 @@ if (BG==1)
#endif
break
;
}
}
}
else
{
switch
(
R
)
{
}
}
else
{
switch
(
R
)
{
case
15
:
{
#ifdef __AVX512BW__
...
...
@@ -723,7 +678,6 @@ if (BG==1)
break
;
}
case
13
:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R13_AVX512
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
...
...
@@ -732,9 +686,7 @@ if (BG==1)
#endif
break
;
}
case
23
:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R23_AVX512
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
...
...
@@ -743,10 +695,9 @@ if (BG==1)
#endif
break
;
}
}
}
}
#endif
...
...
@@ -762,14 +713,8 @@ if (BG==1)
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
bn2cnProcBuf
);
#endif
if
(
BG
==
1
)
{
nrLDPC_bn2cnProcBuf_BG1
(
p_lut
,
bnProcBufRes
,
cnProcBuf
,
Z
);
}
else
{
nrLDPC_bn2cnProcBuf_BG2
(
p_lut
,
bnProcBufRes
,
cnProcBuf
,
Z
);
}
if
(
BG
==
1
)
nrLDPC_bn2cnProcBuf_BG1
(
p_lut
,
bnProcBufRes
,
cnProcBuf
,
Z
);
else
nrLDPC_bn2cnProcBuf_BG2
(
p_lut
,
bnProcBufRes
,
cnProcBuf
,
Z
);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
bn2cnProcBuf
);
#endif
...
...
@@ -778,360 +723,37 @@ if (BG==1)
nrLDPC_debug_writeBuffer2File
(
nrLDPC_buffers_CN_PROC
,
cnProcBuf
);
#endif
// Parity Check
// Parity Check
#ifdef NR_LDPC_ENABLE_PARITY_CHECK
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
cnProcPc
);
start_meas
(
&
p_profiler
->
cnProcPc
);
#endif
if
(
BG
==
1
)
{
pcRes
=
nrLDPC_cnProcPc_BG1
(
p_lut
,
cnProcBuf
,
cnProcBufRes
,
Z
);
}
else
{
pcRes
=
nrLDPC_cnProcPc_BG2
(
p_lut
,
cnProcBuf
,
cnProcBufRes
,
Z
);
}
if
(
BG
==
1
)
pcRes
=
nrLDPC_cnProcPc_BG1
(
p_lut
,
cnProcBuf
,
cnProcBufRes
,
Z
);
else
pcRes
=
nrLDPC_cnProcPc_BG2
(
p_lut
,
cnProcBuf
,
cnProcBufRes
,
Z
);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
cnProcPc
);
#endif
stop_meas
(
&
p_profiler
->
cnProcPc
);
#endif
}
// Last iteration
if
(
(
i
<
numMaxIter
)
&&
(
pcRes
!=
0
)
)
{
// Increase iteration counter
i
++
;
// CN processing
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
cnProc
);
#endif
if
(
BG
==
1
)
{
switch
(
R
)
{
case
13
:
{
#ifdef __AVX512BW__
nrLDPC_cnProc_BG1_R13_AVX512
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#else
nrLDPC_cnProc_BG1_R13_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
case
23
:
{
#ifdef __AVX512BW__
nrLDPC_cnProc_BG1_R23_AVX512
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#else
nrLDPC_cnProc_BG1_R23_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
case
89
:
{
#ifdef __AVX512BW__
nrLDPC_cnProc_BG1_R89_AVX512
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#else
nrLDPC_cnProc_BG1_R89_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
}
else
{
switch
(
R
)
{
case
15
:
{
#ifdef __AVX512BW__
nrLDPC_cnProc_BG2_R15_AVX512
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#else
nrLDPC_cnProc_BG2_R15_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
case
13
:
{
#ifdef __AVX512BW__
nrLDPC_cnProc_BG2_R13_AVX512
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#else
nrLDPC_cnProc_BG2_R13_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
case
23
:
{
#ifdef __AVX512BW__
nrLDPC_cnProc_BG2_R23_AVX512
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#else
nrLDPC_cnProc_BG2_R23_AVX2
(
cnProcBuf
,
cnProcBufRes
,
Z
);
#endif
break
;
}
}
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
cnProc
);
#endif
#ifdef NR_LDPC_DEBUG_MODE
nrLDPC_debug_writeBuffer2File
(
nrLDPC_buffers_CN_PROC_RES
,
p_procBuf
);
#endif
// Send CN results back to BNs
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
cn2bnProcBuf
);
#endif
if
(
BG
==
1
)
{
nrLDPC_cn2bnProcBuf_BG1
(
p_lut
,
cnProcBufRes
,
bnProcBuf
,
Z
);
}
else
{
nrLDPC_cn2bnProcBuf_BG2
(
p_lut
,
cnProcBufRes
,
bnProcBuf
,
Z
);
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
cn2bnProcBuf
);
#endif
#ifdef NR_LDPC_DEBUG_MODE
nrLDPC_debug_writeBuffer2File
(
nrLDPC_buffers_BN_PROC
,
p_procBuf
);
#endif
// BN Processing
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
bnProcPc
);
#endif
// nrLDPC_bnProcPc(p_lut, p_procBuf, Z);
if
(
BG
==
1
)
{
switch
(
R
)
{
case
13
:
{
nrLDPC_bnProcPc_BG1_R13_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
break
;
}
case
23
:
{
nrLDPC_bnProcPc_BG1_R23_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
break
;
}
case
89
:
{
nrLDPC_bnProcPc_BG1_R89_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
break
;
}
}
}
else
{
switch
(
R
)
{
case
15
:
{
nrLDPC_bnProcPc_BG2_R15_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
break
;
}
case
13
:
{
nrLDPC_bnProcPc_BG2_R13_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
break
;
}
case
23
:
{
nrLDPC_bnProcPc_BG2_R23_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
llrProcBuf
,
Z
);
break
;
}
}
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
bnProcPc
);
#endif
#ifdef NR_LDPC_DEBUG_MODE
nrLDPC_debug_writeBuffer2File
(
nrLDPC_buffers_LLR_RES
,
p_procBuf
);
#endif
// If parity check not enabled, no need to send the BN proc results
// back to CNs
#ifdef NR_LDPC_ENABLE_PARITY_CHECK
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
bnProc
);
#endif
//nrLDPC_bnProc(p_lut, p_procBuf, Z);
if
(
BG
==
1
)
{
switch
(
R
)
{
case
13
:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R13_AVX512
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#else
nrLDPC_bnProc_BG1_R13_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#endif
break
;
}
case
23
:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R23_AVX512
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#else
nrLDPC_bnProc_BG1_R23_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#endif
break
;
}
case
89
:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R89_AVX512
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#else
nrLDPC_bnProc_BG1_R89_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#endif
break
;
}
}
}
else
{
switch
(
R
)
{
case
15
:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R15_AVX512
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#else
nrLDPC_bnProc_BG2_R15_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#endif
break
;
}
case
13
:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R13_AVX512
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#else
nrLDPC_bnProc_BG2_R13_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#endif
break
;
}
case
23
:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R23_AVX512
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#else
nrLDPC_bnProc_BG2_R23_AVX2
(
bnProcBuf
,
bnProcBufRes
,
llrRes
,
Z
);
#endif
break
;
}
}
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
bnProc
);
#endif
#ifdef NR_LDPC_DEBUG_MODE
nrLDPC_debug_writeBuffer2File
(
nrLDPC_buffers_BN_PROC_RES
,
p_procBuf
);
#endif
// BN results to CN processing buffer
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
bn2cnProcBuf
);
#endif
if
(
BG
==
1
)
{
nrLDPC_bn2cnProcBuf_BG1
(
p_lut
,
bnProcBufRes
,
cnProcBuf
,
Z
);
}
else
{
nrLDPC_bn2cnProcBuf_BG2
(
p_lut
,
bnProcBufRes
,
cnProcBuf
,
Z
);
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
bn2cnProcBuf
);
#endif
#ifdef NR_LDPC_DEBUG_MODE
nrLDPC_debug_writeBuffer2File
(
nrLDPC_buffers_CN_PROC
,
p_procBuf
);
#endif
// Parity Check
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
cnProcPc
);
#endif
if
(
BG
==
1
)
{
pcRes
=
nrLDPC_cnProcPc_BG1
(
p_lut
,
cnProcBuf
,
cnProcBufRes
,
Z
);
}
else
{
pcRes
=
nrLDPC_cnProcPc_BG2
(
p_lut
,
cnProcBuf
,
cnProcBufRes
,
Z
);
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
cnProcPc
);
#endif
#endif
}
// If the maximum number of iterations is reached and PC still fails, increase the number of iterations
// Thus, i > numMaxIter indicates that PC has failed
#ifdef NR_LDPC_ENABLE_PARITY_CHECK
if
(
pcRes
!=
0
)
{
i
++
;
}
#endif
}
// end while
// Last iteration
if
(
pcRes
!=
0
)
i
++
;
// Assign results from processing buffer to output
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
llrRes2llrOut
);
start_meas
(
&
p_profiler
->
llrRes2llrOut
);
#endif
nrLDPC_llrRes2llrOut
(
p_lut
,
p_llrOut
,
llrRes
,
Z
,
BG
);
nrLDPC_llrRes2llrOut
(
p_lut
,
p_llrOut
,
llrRes
,
Z
,
BG
);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
llrRes2llrOut
);
stop_meas
(
&
p_profiler
->
llrRes2llrOut
);
#endif
// Hard-decision
#ifdef NR_LDPC_PROFILER_DETAIL
start_meas
(
&
p_profiler
->
llr2bit
);
start_meas
(
&
p_profiler
->
llr2bit
);
#endif
if
(
outMode
==
nrLDPC_outMode_BIT
)
{
nrLDPC_llr2bitPacked
(
p_out
,
p_llrOut
,
numLLR
);
}
else
if
(
outMode
==
nrLDPC_outMode_BITINT8
)
{
nrLDPC_llr2bit
(
p_out
,
p_llrOut
,
numLLR
);
}
if
(
outMode
==
nrLDPC_outMode_BIT
)
nrLDPC_llr2bitPacked
(
p_out
,
p_llrOut
,
numLLR
);
else
//if (outMode == nrLDPC_outMode_BITINT8)
nrLDPC_llr2bit
(
p_out
,
p_llrOut
,
numLLR
);
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas
(
&
p_profiler
->
llr2bit
);
#endif
...
...
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_mPass.h
View file @
f6bb869c
...
...
@@ -41,15 +41,6 @@
\param Z Lifting size
\param cshift Circular shift
*/
/**
   \brief memcpy variant built on the x86 "rep movsb" string instruction, which is highly
          optimized on modern processors. On other targets a plain byte loop is used.
   \param dst Pointer to the start of the destination buffer
   \param src Pointer to the source buffer
   \param n Number of bytes to copy
   \return The original value of dst
   \note NOTE(review): this definition has external linkage and lives in a header
         (nrLDPC_mPass.h); including that header from more than one translation unit
         would produce duplicate-symbol link errors. Consider making it static inline.
*/
void *memcpy1(void *dst, const void *src, size_t n)
{
    void *ret = dst;
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
    // "rep movsb" advances the destination and source index registers and counts the
    // count register down to zero. Registers modified by the asm must be declared as
    // read-write ("+") operands; listing them as input-only lets the compiler assume
    // they still hold their original values afterwards.
    __asm__ __volatile__("rep movsb"
                         : "+D"(dst), "+S"(src), "+c"(n)
                         :
                         : "cc", "memory");
#else
    // Portable fallback for targets without the x86 string instructions
    unsigned char *d = dst;
    const unsigned char *s = src;
    while (n--) {
        *d++ = *s++;
    }
#endif
    return ret;
}
static
inline
void
*
nrLDPC_inv_circ_memcpy
(
int8_t
*
str1
,
const
int8_t
*
str2
,
uint16_t
Z
,
uint16_t
cshift
)
{
...
...
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_mPass.h_native_memcpy
deleted
100644 → 0
View file @
aabd9c6c
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The OpenAirInterface Software Alliance licenses this file to You under
* the OAI Public License, Version 1.1 (the "License"); you may not use this file
* except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.openairinterface.org/?page_id=698
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*/
/*!\file nrLDPC_mPass.h
* \brief Defines the functions for message passing
* \author Sebastian Wagner (TCL Communications) Email: <mailto:sebastian.wagner@tcl.com>
* \date 30-09-2019
* \version 2.0
* \note
* \warning
*/
#ifndef __NR_LDPC_MPASS__H__
#define __NR_LDPC_MPASS__H__
#include <string.h>
#include "nrLDPCdecoder_defs.h"
//#include <omp.h>
/**
   \brief Inverse circular memcpy: copies Z bytes from str2 to str1 while moving every
          byte cshift positions towards the end, wrapping around at Z,
          i.e. str1[(k + cshift) mod Z] = str2[k].

                |<------ rem ------->|<- cshift ->|
   (src) str2 = |aaaaaaaaaaaaaaaaaaaa|bbbbbbbbbbbb|

                |<- cshift ->|<------ rem ------->|
   (dst) str1 = |bbbbbbbbbbbb|aaaaaaaaaaaaaaaaaaaa|

   \param str1 Pointer to the start of the destination buffer
   \param str2 Pointer to the source buffer
   \param Z Lifting size
   \param cshift Circular shift
   \return str1
   \note rem = Z - cshift is computed in uint16_t; assumes cshift <= Z -- TODO confirm
*/
static inline void *nrLDPC_inv_circ_memcpy(int8_t *str1, const int8_t *str2, uint16_t Z, uint16_t cshift)
{
    // Length of the leading source segment, which lands behind the wrapped part
    const uint16_t headLen = Z - cshift;
    int8_t *const dstAfterWrap = str1 + cshift;
    const int8_t *const srcWrapped = str2 + headLen;

    // Leading source bytes go to the end of the destination ...
    memcpy(dstAfterWrap, str2, headLen);
    // ... and the trailing cshift source bytes wrap around to its start
    memcpy(str1, srcWrapped, cshift);

    return str1;
}
/**
   \brief Circular memcpy: copies Z bytes from str2 to str1 while moving every byte
          cshift positions towards the start, wrapping around at Z,
          i.e. str1[k] = str2[(k + cshift) mod Z].

                |<- cshift ->|<------ rem ------->|
   (src) str2 = |aaaaaaaaaaaa|bbbbbbbbbbbbbbbbbbbb|

                |<------ rem ------->|<- cshift ->|
   (dst) str1 = |bbbbbbbbbbbbbbbbbbbb|aaaaaaaaaaaa|

   \param str1 Pointer to the start of the destination buffer
   \param str2 Pointer to the source buffer
   \param Z Lifting size
   \param cshift Circular shift
   \return str1
   \note rem = Z - cshift is computed in uint16_t; assumes cshift <= Z -- TODO confirm
*/
static inline void *nrLDPC_circ_memcpy(int8_t *str1, const int8_t *str2, uint16_t Z, uint16_t cshift)
{
    // Length of the trailing source segment, which becomes the head of the destination
    const uint16_t tailLen = Z - cshift;
    const int8_t *const srcAfterShift = str2 + cshift;
    int8_t *const dstWrapped = str1 + tailLen;

    // Source bytes from cshift onwards go to the start of the destination ...
    memcpy(str1, srcAfterShift, tailLen);
    // ... and the first cshift source bytes wrap around to its end
    memcpy(dstWrapped, str2, cshift);

    return str1;
}
/**
\brief Copies the input LLRs to their corresponding place in the LLR processing buffer.
Example: BG2
| 0| 0| LLRs --> |
BN Groups |22|23|10| 5| 5|14| 7|13| 6| 8| 9|16| 9|12|1|1|...|1|
^---------------------------------------/---- /
_________________________/ | /
/ ____________________________|___/
/ / \
LLR Proc Buffer (BNG) | 1| 5| 6| 7| 8| 9|10|12|13|14|16|22|23|
Number BN in BNG(R15) |38| 2| 1| 1| 1| 2| 1| 1| 1| 1| 1| 1| 1|
Idx: 0 ^ ^ ^
38*384=14592 _____| ... | |
50*384=19200 ----------------------------------- |
51*384=19584 --------------------------------------
\param p_lut Pointer to decoder LUTs
\param llr Pointer to input LLRs
\param p_procBuf Pointer to the processing buffers
\param Z Lifting size
\param BG Base graph
*/
static inline void nrLDPC_llr2llrProcBuf(t_nrLDPC_lut* p_lut, int8_t* llr, t_nrLDPC_procBuf* p_procBuf, uint16_t Z, uint8_t BG)
{
    // Destination buffer and the two LUTs that place each input column inside it
    int8_t* const dstBuf = p_procBuf->llrProcBuf;
    const uint16_t* const addrLut = p_lut->llr2llrProcBufAddr;
    const uint8_t* const bnPosLut = p_lut->llr2llrProcBufBnPos;

    // Number of BNs in the first BN group (the BNs connected to a single CN)
    const uint8_t nBnFirstGroup = p_lut->numBnInBnGroups[0];

    // First parity column of the selected base graph
    uint32_t firstParityCol;
    if (BG == 1)
    {
        firstParityCol = NR_LDPC_START_COL_PARITY_BG1;
    }
    else
    {
        firstParityCol = NR_LDPC_START_COL_PARITY_BG2;
    }

    // Offset in the input LLRs at which the parity columns start
    const uint32_t parityOffset = firstParityCol*Z;

    // The BNs of the first group are taken as one contiguous run starting at the parity
    // offset and are placed at the very beginning of the processing buffer
    if (nBnFirstGroup > 0)
    {
        memcpy(&dstBuf[0], &llr[parityOffset], nBnFirstGroup*Z);
    }

    // Every column in front of the parity columns is copied individually (Z values each)
    // to the position given by the address LUT plus its BN position inside the group.
    // First 2 columns might be set to zero directly if it's true they always belong to
    // the groups with highest number of connected CNs...
    const int8_t* srcCol = llr;
    for (uint32_t col = 0; col < firstParityCol; col++)
    {
        const uint32_t dstOffset = addrLut[col] + bnPosLut[col]*Z;
        memcpy(&dstBuf[dstOffset], srcCol, Z);
        srcCol += Z;
    }
}
/**
\brief Copies the input LLRs to their corresponding place in the CN processing buffer for BG1.
\param p_lut Pointer to decoder LUTs
\param llr Pointer to input LLRs
\param p_procBuf Pointer to the processing buffers
\param Z Lifting size
*/
static inline void nrLDPC_llr2CnProcBuf_BG1(t_nrLDPC_lut* p_lut, int8_t* llr, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
    // 2-D views of the per-group LUTs. First index: BN position inside a CN of the group,
    // second index: CN inside the group. The row width is always taken from
    // lut_numCnInCnGroups_BG1_R13, while the loops below are bounded by p_lut->numCnInCnGroups.
    const uint16_t (*shiftCNG3) [lut_numCnInCnGroups_BG1_R13[0]] = (uint16_t(*)[lut_numCnInCnGroups_BG1_R13[0]]) p_lut->circShift[0];
    const uint16_t (*shiftCNG4) [lut_numCnInCnGroups_BG1_R13[1]] = (uint16_t(*)[lut_numCnInCnGroups_BG1_R13[1]]) p_lut->circShift[1];
    const uint16_t (*shiftCNG5) [lut_numCnInCnGroups_BG1_R13[2]] = (uint16_t(*)[lut_numCnInCnGroups_BG1_R13[2]]) p_lut->circShift[2];
    const uint16_t (*shiftCNG6) [lut_numCnInCnGroups_BG1_R13[3]] = (uint16_t(*)[lut_numCnInCnGroups_BG1_R13[3]]) p_lut->circShift[3];
    const uint16_t (*shiftCNG7) [lut_numCnInCnGroups_BG1_R13[4]] = (uint16_t(*)[lut_numCnInCnGroups_BG1_R13[4]]) p_lut->circShift[4];
    const uint16_t (*shiftCNG8) [lut_numCnInCnGroups_BG1_R13[5]] = (uint16_t(*)[lut_numCnInCnGroups_BG1_R13[5]]) p_lut->circShift[5];
    const uint16_t (*shiftCNG9) [lut_numCnInCnGroups_BG1_R13[6]] = (uint16_t(*)[lut_numCnInCnGroups_BG1_R13[6]]) p_lut->circShift[6];
    const uint16_t (*shiftCNG10)[lut_numCnInCnGroups_BG1_R13[7]] = (uint16_t(*)[lut_numCnInCnGroups_BG1_R13[7]]) p_lut->circShift[7];
    const uint16_t (*shiftCNG19)[lut_numCnInCnGroups_BG1_R13[8]] = (uint16_t(*)[lut_numCnInCnGroups_BG1_R13[8]]) p_lut->circShift[8];

    const uint8_t (*bnPosCNG3) [lut_numCnInCnGroups_BG1_R13[0]] = (uint8_t(*)[lut_numCnInCnGroups_BG1_R13[0]]) p_lut->posBnInCnProcBuf[0];
    const uint8_t (*bnPosCNG4) [lut_numCnInCnGroups_BG1_R13[1]] = (uint8_t(*)[lut_numCnInCnGroups_BG1_R13[1]]) p_lut->posBnInCnProcBuf[1];
    const uint8_t (*bnPosCNG5) [lut_numCnInCnGroups_BG1_R13[2]] = (uint8_t(*)[lut_numCnInCnGroups_BG1_R13[2]]) p_lut->posBnInCnProcBuf[2];
    const uint8_t (*bnPosCNG6) [lut_numCnInCnGroups_BG1_R13[3]] = (uint8_t(*)[lut_numCnInCnGroups_BG1_R13[3]]) p_lut->posBnInCnProcBuf[3];
    const uint8_t (*bnPosCNG7) [lut_numCnInCnGroups_BG1_R13[4]] = (uint8_t(*)[lut_numCnInCnGroups_BG1_R13[4]]) p_lut->posBnInCnProcBuf[4];
    const uint8_t (*bnPosCNG8) [lut_numCnInCnGroups_BG1_R13[5]] = (uint8_t(*)[lut_numCnInCnGroups_BG1_R13[5]]) p_lut->posBnInCnProcBuf[5];
    const uint8_t (*bnPosCNG9) [lut_numCnInCnGroups_BG1_R13[6]] = (uint8_t(*)[lut_numCnInCnGroups_BG1_R13[6]]) p_lut->posBnInCnProcBuf[6];
    const uint8_t (*bnPosCNG10)[lut_numCnInCnGroups_BG1_R13[7]] = (uint8_t(*)[lut_numCnInCnGroups_BG1_R13[7]]) p_lut->posBnInCnProcBuf[7];
    const uint8_t (*bnPosCNG19)[lut_numCnInCnGroups_BG1_R13[8]] = (uint8_t(*)[lut_numCnInCnGroups_BG1_R13[8]]) p_lut->posBnInCnProcBuf[8];

    const uint8_t* nCnInGroup = p_lut->numCnInCnGroups;
    const uint32_t* groupBase = p_lut->startAddrCnGroups;
    int8_t* const cnBuf = p_procBuf->cnProcBuf;

    // Distance in cnBuf between two consecutive BN rows of the current group
    uint32_t rowStride;
    // Write cursor inside cnBuf; advances by Z for every CN of a row
    int8_t* dst;
    // Offset of the source column inside llr
    uint32_t srcOff;

    // =====================================================================
    // CN group with 3 BNs
    // Only the first CN (column 0) of each row is copied here.
    // NOTE(review): presumably this group holds a single CN in BG1 -- confirm against the LUTs
    rowStride = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 3; bn++)
    {
        dst = &cnBuf[groupBase[0] + bn*rowStride];
        srcOff = bnPosCNG3[bn][0]*Z;
        nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG3[bn][0]);
    }

    // =====================================================================
    // CN group with 4 BNs
    rowStride = lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 4; bn++)
    {
        dst = &cnBuf[groupBase[1] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[1]; cn++)
        {
            srcOff = bnPosCNG4[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG4[bn][cn]);
            dst += Z;
        }
    }

    // =====================================================================
    // CN group with 5 BNs
    rowStride = lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 5; bn++)
    {
        dst = &cnBuf[groupBase[2] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[2]; cn++)
        {
            srcOff = bnPosCNG5[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG5[bn][cn]);
            dst += Z;
        }
    }

    // =====================================================================
    // CN group with 6 BNs
    rowStride = lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 6; bn++)
    {
        dst = &cnBuf[groupBase[3] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[3]; cn++)
        {
            srcOff = bnPosCNG6[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG6[bn][cn]);
            dst += Z;
        }
    }

    // =====================================================================
    // CN group with 7 BNs
    rowStride = lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 7; bn++)
    {
        dst = &cnBuf[groupBase[4] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[4]; cn++)
        {
            srcOff = bnPosCNG7[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG7[bn][cn]);
            dst += Z;
        }
    }

    // =====================================================================
    // CN group with 8 BNs
    rowStride = lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 8; bn++)
    {
        dst = &cnBuf[groupBase[5] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[5]; cn++)
        {
            srcOff = bnPosCNG8[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG8[bn][cn]);
            dst += Z;
        }
    }

    // =====================================================================
    // CN group with 9 BNs
    rowStride = lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 9; bn++)
    {
        dst = &cnBuf[groupBase[6] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[6]; cn++)
        {
            srcOff = bnPosCNG9[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG9[bn][cn]);
            dst += Z;
        }
    }

    // =====================================================================
    // CN group with 10 BNs
    rowStride = lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 10; bn++)
    {
        dst = &cnBuf[groupBase[7] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[7]; cn++)
        {
            srcOff = bnPosCNG10[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG10[bn][cn]);
            dst += Z;
        }
    }

    // =====================================================================
    // CN group with 19 BNs
    rowStride = lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 19; bn++)
    {
        dst = &cnBuf[groupBase[8] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[8]; cn++)
        {
            srcOff = bnPosCNG19[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG19[bn][cn]);
            dst += Z;
        }
    }
}
/**
\brief Copies the input LLRs to their corresponding place in the CN processing buffer for BG2.
Example: BG2
| 0| 0| LLRs --> |
BN Groups |22|23|10| 5| 5|14| 7|13| 6| 8| 9|16| 9|12|1|1|...|1|
CN Processing Buffer (CNGs) | 3| 4| 5| 6| 8|10|
Number of CN per CNG (R15) | 6|20| 9| 3| 2| 2|
0 ^ ^\ \
3*6*384=6912 _________| || \_____________
(3*6+4*20+5*9)*384=54912____|| \
Bit | 1| 2| 3| 4| 5| 6|
3*Z CNs>| |<
^
54912 + 3*384______|
\param p_lut Pointer to decoder LUTs
\param llr Pointer to input LLRs
\param p_procBuf Pointer to the processing buffers
\param Z Lifting size
*/
static inline void nrLDPC_llr2CnProcBuf_BG2(t_nrLDPC_lut* p_lut, int8_t* llr, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
    // 2-D views of the per-group LUTs. First index: BN position inside a CN of the group,
    // second index: CN inside the group. The row width is always taken from
    // lut_numCnInCnGroups_BG2_R15, while the loops below are bounded by p_lut->numCnInCnGroups.
    const uint16_t (*shiftCNG3) [lut_numCnInCnGroups_BG2_R15[0]] = (uint16_t(*)[lut_numCnInCnGroups_BG2_R15[0]]) p_lut->circShift[0];
    const uint16_t (*shiftCNG4) [lut_numCnInCnGroups_BG2_R15[1]] = (uint16_t(*)[lut_numCnInCnGroups_BG2_R15[1]]) p_lut->circShift[1];
    const uint16_t (*shiftCNG5) [lut_numCnInCnGroups_BG2_R15[2]] = (uint16_t(*)[lut_numCnInCnGroups_BG2_R15[2]]) p_lut->circShift[2];
    const uint16_t (*shiftCNG6) [lut_numCnInCnGroups_BG2_R15[3]] = (uint16_t(*)[lut_numCnInCnGroups_BG2_R15[3]]) p_lut->circShift[3];
    const uint16_t (*shiftCNG8) [lut_numCnInCnGroups_BG2_R15[4]] = (uint16_t(*)[lut_numCnInCnGroups_BG2_R15[4]]) p_lut->circShift[4];
    const uint16_t (*shiftCNG10) [lut_numCnInCnGroups_BG2_R15[5]] = (uint16_t(*)[lut_numCnInCnGroups_BG2_R15[5]]) p_lut->circShift[5];

    const uint8_t (*bnPosCNG3) [lut_numCnInCnGroups_BG2_R15[0]] = (uint8_t(*)[lut_numCnInCnGroups_BG2_R15[0]]) p_lut->posBnInCnProcBuf[0];
    const uint8_t (*bnPosCNG4) [lut_numCnInCnGroups_BG2_R15[1]] = (uint8_t(*)[lut_numCnInCnGroups_BG2_R15[1]]) p_lut->posBnInCnProcBuf[1];
    const uint8_t (*bnPosCNG5) [lut_numCnInCnGroups_BG2_R15[2]] = (uint8_t(*)[lut_numCnInCnGroups_BG2_R15[2]]) p_lut->posBnInCnProcBuf[2];
    const uint8_t (*bnPosCNG6) [lut_numCnInCnGroups_BG2_R15[3]] = (uint8_t(*)[lut_numCnInCnGroups_BG2_R15[3]]) p_lut->posBnInCnProcBuf[3];
    const uint8_t (*bnPosCNG8) [lut_numCnInCnGroups_BG2_R15[4]] = (uint8_t(*)[lut_numCnInCnGroups_BG2_R15[4]]) p_lut->posBnInCnProcBuf[4];
    const uint8_t (*bnPosCNG10) [lut_numCnInCnGroups_BG2_R15[5]] = (uint8_t(*)[lut_numCnInCnGroups_BG2_R15[5]]) p_lut->posBnInCnProcBuf[5];

    const uint8_t* nCnInGroup = p_lut->numCnInCnGroups;
    const uint32_t* groupBase = p_lut->startAddrCnGroups;
    int8_t* const cnBuf = p_procBuf->cnProcBuf;

    // Distance in cnBuf between two consecutive BN rows of the current group
    uint32_t rowStride;
    // Write cursor inside cnBuf; advances by Z for every CN of a row
    int8_t* dst;
    // Offset of the source column inside llr
    uint32_t srcOff;

    // =====================================================================
    // CN group with 3 BNs
    rowStride = lut_numCnInCnGroups_BG2_R15[0]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 3; bn++)
    {
        dst = &cnBuf[groupBase[0] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[0]; cn++)
        {
            srcOff = bnPosCNG3[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG3[bn][cn]);
            dst += Z;
        }
    }

    // =====================================================================
    // CN group with 4 BNs
    rowStride = lut_numCnInCnGroups_BG2_R15[1]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 4; bn++)
    {
        dst = &cnBuf[groupBase[1] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[1]; cn++)
        {
            srcOff = bnPosCNG4[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG4[bn][cn]);
            dst += Z;
        }
    }

    // =====================================================================
    // CN group with 5 BNs
    rowStride = lut_numCnInCnGroups_BG2_R15[2]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 5; bn++)
    {
        dst = &cnBuf[groupBase[2] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[2]; cn++)
        {
            srcOff = bnPosCNG5[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG5[bn][cn]);
            dst += Z;
        }
    }

    // =====================================================================
    // CN group with 6 BNs
    rowStride = lut_numCnInCnGroups_BG2_R15[3]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 6; bn++)
    {
        dst = &cnBuf[groupBase[3] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[3]; cn++)
        {
            srcOff = bnPosCNG6[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG6[bn][cn]);
            dst += Z;
        }
    }

    // =====================================================================
    // CN group with 8 BNs
    rowStride = lut_numCnInCnGroups_BG2_R15[4]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 8; bn++)
    {
        dst = &cnBuf[groupBase[4] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[4]; cn++)
        {
            srcOff = bnPosCNG8[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG8[bn][cn]);
            dst += Z;
        }
    }

    // =====================================================================
    // CN group with 10 BNs
    rowStride = lut_numCnInCnGroups_BG2_R15[5]*NR_LDPC_ZMAX;
    for (uint32_t bn = 0; bn < 10; bn++)
    {
        dst = &cnBuf[groupBase[5] + bn*rowStride];
        for (uint32_t cn = 0; cn < nCnInGroup[5]; cn++)
        {
            srcOff = bnPosCNG10[bn][cn]*Z;
            nrLDPC_circ_memcpy(dst, &llr[srcOff], Z, shiftCNG10[bn][cn]);
            dst += Z;
        }
    }
}
/**
\brief Copies the values in the CN processing results buffer to their corresponding place in the BN processing buffer for BG2.
\param p_lut Pointer to decoder LUTs
\param p_procBuf Pointer to the processing buffers
\param Z Lifting size
*/
static inline void nrLDPC_cn2bnProcBuf_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
    // Number of BNs connected to each CN of the BG2 CN groups, in LUT order
    // (CN groups with 3, 4, 5, 6, 8 and 10 BNs).
    static const uint8_t numBnInCnGroup[6] = {3, 4, 5, 6, 8, 10};
    const uint32_t numCnGroups = sizeof(numBnInCnGroup)/sizeof(numBnInCnGroup[0]);

    const uint8_t*  lut_numCnInCnGroups   = p_lut->numCnInCnGroups;
    const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;

    int8_t* cnProcBufRes = p_procBuf->cnProcBufRes;
    int8_t* bnProcBuf    = p_procBuf->bnProcBuf;

    // The per-group LUTs are logically 2D [BN position j][CN i]. They are indexed
    // here as flat arrays (j*rowLength + i) instead of through pointer-to-VLA
    // types, so a group that currently holds zero CNs never creates a
    // zero-length array type.
    for (uint32_t g = 0; g < numCnGroups; g++)
    {
        // Row length of the startAddrBnProcBuf/bnPosBnProcBuf LUTs and number of CNs to copy
        const uint32_t numCn = lut_numCnInCnGroups[g];
        // Row length of the circShift LUT (taken from lut_numCnInCnGroups_BG2_R15)
        const uint32_t numCnLut = lut_numCnInCnGroups_BG2_R15[g];
        // Distance in cnProcBufRes between two consecutive BN positions of this group
        const uint32_t bitOffsetInGroup = lut_numCnInCnGroups_BG2_R15[g]*NR_LDPC_ZMAX;

        const uint16_t* circShift   = (const uint16_t*) p_lut->circShift[g];
        const uint32_t* startAddrBn = (const uint32_t*) p_lut->startAddrBnProcBuf[g];
        const uint8_t*  bnPos       = (const uint8_t*)  p_lut->bnPosBnProcBuf[g];

        for (uint32_t j = 0; j < numBnInCnGroup[g]; j++)
        {
            int8_t* p_cnProcBufRes = &cnProcBufRes[lut_startAddrCnGroups[g] + j*bitOffsetInGroup];

            for (uint32_t i = 0; i < numCn; i++)
            {
                const uint32_t idxLut = j*numCn + i;
                const uint32_t idxBn  = startAddrBn[idxLut] + bnPos[idxLut]*Z;

                nrLDPC_inv_circ_memcpy(&bnProcBuf[idxBn], p_cnProcBufRes, Z, circShift[j*numCnLut + i]);

                // Results of consecutive CNs are stored Z values apart
                p_cnProcBufRes += Z;
            }
        }
    }
}
/**
   \brief Copies the values in the CN processing results buffer to their corresponding place in the BN processing buffer for BG1.
   For every CN group and every BN position of that group, the Z values of each CN are
   copied with nrLDPC_inv_circ_memcpy using the circular shift stored in the LUT.
   \param p_lut Pointer to decoder LUTs
   \param p_procBuf Pointer to the processing buffers
   \param Z Lifting size
*/
static inline void nrLDPC_cn2bnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
    // Number of BNs connected to each CN of the BG1 CN groups, in LUT order
    // (CN groups with 3, 4, 5, 6, 7, 8, 9, 10 and 19 BNs).
    static const uint8_t numBnInCnGroup[9] = {3, 4, 5, 6, 7, 8, 9, 10, 19};
    const uint32_t numCnGroups = sizeof(numBnInCnGroup)/sizeof(numBnInCnGroup[0]);

    const uint8_t*  lut_numCnInCnGroups   = p_lut->numCnInCnGroups;
    const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;

    int8_t* cnProcBufRes = p_procBuf->cnProcBufRes;
    int8_t* bnProcBuf    = p_procBuf->bnProcBuf;

    // The per-group LUTs are logically 2D [BN position j][CN i]. They are indexed
    // here as flat arrays (j*rowLength + i) instead of through pointer-to-VLA
    // types, so a group that currently holds zero CNs never creates a
    // zero-length array type.

    // =====================================================================
    // CN group with 3 BNs
    // Only the first CN of the group is copied and no BN position offset is
    // applied: the bnPosBnProcBuf LUT is not read for this group.
    {
        const uint32_t numCn    = lut_numCnInCnGroups[0];
        const uint32_t numCnLut = lut_numCnInCnGroups_BG1_R13[0];
        const uint32_t bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX;

        const uint16_t* circShift   = (const uint16_t*) p_lut->circShift[0];
        const uint32_t* startAddrBn = (const uint32_t*) p_lut->startAddrBnProcBuf[0];

        for (uint32_t j = 0; j < numBnInCnGroup[0]; j++)
        {
            int8_t* p_cnProcBufRes = &cnProcBufRes[lut_startAddrCnGroups[0] + j*bitOffsetInGroup];

            nrLDPC_inv_circ_memcpy(&bnProcBuf[startAddrBn[j*numCn]], p_cnProcBufRes, Z, circShift[j*numCnLut]);
        }
    }

    // =====================================================================
    // CN groups with 4 to 19 BNs
    for (uint32_t g = 1; g < numCnGroups; g++)
    {
        // Row length of the startAddrBnProcBuf/bnPosBnProcBuf LUTs and number of CNs to copy
        const uint32_t numCn = lut_numCnInCnGroups[g];
        // Row length of the circShift LUT (taken from lut_numCnInCnGroups_BG1_R13)
        const uint32_t numCnLut = lut_numCnInCnGroups_BG1_R13[g];
        // Distance in cnProcBufRes between two consecutive BN positions of this group
        const uint32_t bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[g]*NR_LDPC_ZMAX;

        const uint16_t* circShift   = (const uint16_t*) p_lut->circShift[g];
        const uint32_t* startAddrBn = (const uint32_t*) p_lut->startAddrBnProcBuf[g];
        const uint8_t*  bnPos       = (const uint8_t*)  p_lut->bnPosBnProcBuf[g];

        for (uint32_t j = 0; j < numBnInCnGroup[g]; j++)
        {
            int8_t* p_cnProcBufRes = &cnProcBufRes[lut_startAddrCnGroups[g] + j*bitOffsetInGroup];

            for (uint32_t i = 0; i < numCn; i++)
            {
                const uint32_t idxLut = j*numCn + i;
                const uint32_t idxBn  = startAddrBn[idxLut] + bnPos[idxLut]*Z;

                nrLDPC_inv_circ_memcpy(&bnProcBuf[idxBn], p_cnProcBufRes, Z, circShift[j*numCnLut + i]);

                // Results of consecutive CNs are stored Z values apart
                p_cnProcBufRes += Z;
            }
        }
    }
}
/**
   \brief Copies the values in the BN processing results buffer to their corresponding place in the CN processing buffer for BG2.
   For every CN group and every copied BN position of that group, the Z values of each CN are
   copied with nrLDPC_circ_memcpy using the circular shift stored in the LUT.
   \param p_lut Pointer to decoder LUTs
   \param p_procBuf Pointer to the processing buffers
   \param Z Lifting size
*/
static inline void nrLDPC_bn2cnProcBuf_BG2(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
    // Number of BN positions copied back per CN group, in LUT order
    // (CN groups with 3, 4, 5, 6, 8 and 10 BNs).
    // For CN groups 3 to 6 no need to send the last BN back since it's single edge
    // and BN processing does not change the value already in the CN proc buf.
    // CN groups 8 and 10 copy all of their BNs.
    static const uint8_t numBnToCopy[6] = {2, 3, 4, 5, 8, 10};
    const uint32_t numCnGroups = sizeof(numBnToCopy)/sizeof(numBnToCopy[0]);

    const uint8_t*  lut_numCnInCnGroups   = p_lut->numCnInCnGroups;
    const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;

    int8_t* cnProcBuf    = p_procBuf->cnProcBuf;
    int8_t* bnProcBufRes = p_procBuf->bnProcBufRes;

    // The per-group LUTs are logically 2D [BN position j][CN i]. They are indexed
    // here as flat arrays (j*rowLength + i) instead of through pointer-to-VLA
    // types, so a group that currently holds zero CNs never creates a
    // zero-length array type.
    for (uint32_t g = 0; g < numCnGroups; g++)
    {
        // Row length of the startAddrBnProcBuf/bnPosBnProcBuf LUTs and number of CNs to copy
        const uint32_t numCn = lut_numCnInCnGroups[g];
        // Row length of the circShift LUT (taken from lut_numCnInCnGroups_BG2_R15)
        const uint32_t numCnLut = lut_numCnInCnGroups_BG2_R15[g];
        // Distance in cnProcBuf between two consecutive BN positions of this group
        const uint32_t bitOffsetInGroup = lut_numCnInCnGroups_BG2_R15[g]*NR_LDPC_ZMAX;

        const uint16_t* circShift   = (const uint16_t*) p_lut->circShift[g];
        const uint32_t* startAddrBn = (const uint32_t*) p_lut->startAddrBnProcBuf[g];
        const uint8_t*  bnPos       = (const uint8_t*)  p_lut->bnPosBnProcBuf[g];

        for (uint32_t j = 0; j < numBnToCopy[g]; j++)
        {
            int8_t* p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[g] + j*bitOffsetInGroup];

            for (uint32_t i = 0; i < numCn; i++)
            {
                const uint32_t idxLut = j*numCn + i;
                const uint32_t idxBn  = startAddrBn[idxLut] + bnPos[idxLut]*Z;

                nrLDPC_circ_memcpy(p_cnProcBuf, &bnProcBufRes[idxBn], Z, circShift[j*numCnLut + i]);

                // Inputs of consecutive CNs are stored Z values apart
                p_cnProcBuf += Z;
            }
        }
    }
}
/**
   \brief Copies the values in the BN processing results buffer to their corresponding place in the CN processing buffer for BG1.
   For every CN group and every copied BN position of that group, the Z values of each CN are
   copied with nrLDPC_circ_memcpy using the circular shift stored in the LUT.
   \param p_lut Pointer to decoder LUTs
   \param p_procBuf Pointer to the processing buffers
   \param Z Lifting size
*/
static inline void nrLDPC_bn2cnProcBuf_BG1(t_nrLDPC_lut* p_lut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z)
{
    // Number of BN positions copied back per CN group, in LUT order
    // (CN groups with 3, 4, 5, 6, 7, 8, 9, 10 and 19 BNs).
    // For CN groups 3 to 10 no need to send the last BN back since it's single edge
    // and BN processing does not change the value already in the CN proc buf.
    // CN group 19 copies all 19 BNs.
    static const uint8_t numBnToCopy[9] = {2, 3, 4, 5, 6, 7, 8, 9, 19};
    const uint32_t numCnGroups = sizeof(numBnToCopy)/sizeof(numBnToCopy[0]);

    const uint8_t*  lut_numCnInCnGroups   = p_lut->numCnInCnGroups;
    const uint32_t* lut_startAddrCnGroups = p_lut->startAddrCnGroups;

    int8_t* cnProcBuf    = p_procBuf->cnProcBuf;
    int8_t* bnProcBufRes = p_procBuf->bnProcBufRes;

    // The per-group LUTs are logically 2D [BN position j][CN i]. They are indexed
    // here as flat arrays (j*rowLength + i) instead of through pointer-to-VLA
    // types, so a group that currently holds zero CNs never creates a
    // zero-length array type.
    //
    // No "#pragma omp simd" is applied to any of the loops below: the destination
    // pointer advances by Z from one iteration to the next and each iteration
    // calls nrLDPC_circ_memcpy, so the iterations are not independent SIMD lanes.

    // =====================================================================
    // CN group with 3 BNs
    // Only the first CN of the group is copied and no BN position offset is
    // applied: the bnPosBnProcBuf LUT is not read for this group.
    {
        const uint32_t numCn    = lut_numCnInCnGroups[0];
        const uint32_t numCnLut = lut_numCnInCnGroups_BG1_R13[0];
        const uint32_t bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX;

        const uint16_t* circShift   = (const uint16_t*) p_lut->circShift[0];
        const uint32_t* startAddrBn = (const uint32_t*) p_lut->startAddrBnProcBuf[0];

        for (uint32_t j = 0; j < numBnToCopy[0]; j++)
        {
            int8_t* p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[0] + j*bitOffsetInGroup];

            nrLDPC_circ_memcpy(p_cnProcBuf, &bnProcBufRes[startAddrBn[j*numCn]], Z, circShift[j*numCnLut]);
        }
    }

    // =====================================================================
    // CN groups with 4 to 19 BNs
    for (uint32_t g = 1; g < numCnGroups; g++)
    {
        // Row length of the startAddrBnProcBuf/bnPosBnProcBuf LUTs and number of CNs to copy
        const uint32_t numCn = lut_numCnInCnGroups[g];
        // Row length of the circShift LUT (taken from lut_numCnInCnGroups_BG1_R13)
        const uint32_t numCnLut = lut_numCnInCnGroups_BG1_R13[g];
        // Distance in cnProcBuf between two consecutive BN positions of this group
        const uint32_t bitOffsetInGroup = lut_numCnInCnGroups_BG1_R13[g]*NR_LDPC_ZMAX;

        const uint16_t* circShift   = (const uint16_t*) p_lut->circShift[g];
        const uint32_t* startAddrBn = (const uint32_t*) p_lut->startAddrBnProcBuf[g];
        const uint8_t*  bnPos       = (const uint8_t*)  p_lut->bnPosBnProcBuf[g];

        for (uint32_t j = 0; j < numBnToCopy[g]; j++)
        {
            int8_t* p_cnProcBuf = &cnProcBuf[lut_startAddrCnGroups[g] + j*bitOffsetInGroup];

            for (uint32_t i = 0; i < numCn; i++)
            {
                const uint32_t idxLut = j*numCn + i;
                const uint32_t idxBn  = startAddrBn[idxLut] + bnPos[idxLut]*Z;

                nrLDPC_circ_memcpy(p_cnProcBuf, &bnProcBufRes[idxBn], Z, circShift[j*numCnLut + i]);

                // Inputs of consecutive CNs are stored Z values apart
                p_cnProcBuf += Z;
            }
        }
    }
}
/**
   \brief Copies the values in the LLR results buffer to their corresponding place in the output LLR vector.
   The first BN group of the LUT is copied as one contiguous chunk starting at the first
   parity column; every column before it is copied individually, Z values at a time,
   from the position given by the llr2llrProcBuf LUTs.
   \param p_lut Pointer to decoder LUTs
   \param llrOut Pointer to output LLRs
   \param p_procBuf Pointer to the processing buffers
   \param Z Lifting size
   \param BG Base graph
*/
static inline void nrLDPC_llrRes2llrOut(t_nrLDPC_lut* p_lut, int8_t* llrOut, t_nrLDPC_procBuf* p_procBuf, uint16_t Z, uint8_t BG)
{
    const uint16_t* addrLut  = p_lut->llr2llrProcBufAddr;
    const uint8_t*  bnPosLut = p_lut->llr2llrProcBufBnPos;
    const uint8_t   numBnGroup1 = p_lut->numBnInBnGroups[0];
    int8_t* llrRes = p_procBuf->llrRes;

    // First parity column of the selected base graph (any BG other than 1 is treated as BG2)
    uint32_t firstParityCol;
    if (BG == 1)
    {
        firstParityCol = NR_LDPC_START_COL_PARITY_BG1;
    }
    else
    {
        firstParityCol = NR_LDPC_START_COL_PARITY_BG2;
    }

    // Copy LLRs connected to 1 CN: they sit at the start of llrRes and go
    // to the output starting at the first parity column
    if (numBnGroup1 > 0)
    {
        memcpy(&llrOut[firstParityCol*Z], llrRes, numBnGroup1*Z);
    }

    // Copy the remaining columns one by one to the start of the output
    for (uint32_t col = 0; col < firstParityCol; col++)
    {
        const uint32_t srcIdx = addrLut[col] + bnPosLut[col]*Z;

        memcpy(&llrOut[col*Z], &llrRes[srcIdx], Z);
    }
}
#endif
openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_cnProc/cnProc_gen_BG1_avx2.c
View file @
f6bb869c
...
...
@@ -96,7 +96,7 @@ void nrLDPC_cnProc_BG1_generator_AVX2(const char* dir, int R)
// for (i=0; i<M; i++,iprime++)
// {
fprintf
(
fd
,
" for (int i=0;i<M;i+
=2
) {
\n
"
);
fprintf
(
fd
,
" for (int i=0;i<M;i+
+
) {
\n
"
);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf
(
fd
,
" ymm0 = ((__m256i*)cnProcBuf)[%d+i];
\n
"
,(
lut_startAddrCnGroups
[
0
]
>>
5
)
+
lut_idxCnProcG3
[
j
][
0
]);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment