Commit 3ed622af authored by Sy's avatar Sy

use of avx512 at CN processing level

parent a7e61374
......@@ -2745,8 +2745,8 @@ static inline void nrLDPC_llr2bit(int8_t* out, int8_t* llrOut, uint16_t numLLR)
uint32_t M = numLLR>>5;
uint32_t Mr = numLLR&31;
const __m256i* p_zeros = (__m256i*) zeros256_epi8;
const __m256i* p_ones = (__m256i*) ones256_epi8;
const __m256i* p_zeros = (__m256i*) zeros512_epi8;
const __m256i* p_ones = (__m256i*) ones512_epi8;
for (i=0; i<M; i++)
{
......
......@@ -34,15 +34,15 @@
#include <stdlib.h>
#include "nrLDPC_types.h"
#ifndef malloc32_clear
#ifndef malloc64_clear
/**
\brief Allocates 32 byte aligned memory and initializes to zero
\brief Allocates 64 byte aligned memory and initializes to zero
\param size Input size in bytes
\return Pointer to memory
*/
static inline void* malloc32_clear(size_t size)
static inline void* malloc64_clear(size_t size)
{
void* ptr = (void*) memalign(32, size+32);
void* ptr = (void*) memalign(64, size+64);
memset(ptr, 0, size);
return ptr;
}
......@@ -56,16 +56,16 @@ static inline void* malloc32_clear(size_t size)
*/
static inline t_nrLDPC_procBuf* nrLDPC_init_mem(void)
{
t_nrLDPC_procBuf* p_procBuf = (t_nrLDPC_procBuf*) malloc32_clear(sizeof(t_nrLDPC_procBuf));
t_nrLDPC_procBuf* p_procBuf = (t_nrLDPC_procBuf*) malloc64_clear(sizeof(t_nrLDPC_procBuf));
if (p_procBuf)
{
p_procBuf->cnProcBuf = (int8_t*) malloc32_clear(NR_LDPC_SIZE_CN_PROC_BUF*sizeof(int8_t));
p_procBuf->cnProcBufRes = (int8_t*) malloc32_clear(NR_LDPC_SIZE_CN_PROC_BUF*sizeof(int8_t));
p_procBuf->bnProcBuf = (int8_t*) malloc32_clear(NR_LDPC_SIZE_BN_PROC_BUF*sizeof(int8_t));
p_procBuf->bnProcBufRes = (int8_t*) malloc32_clear(NR_LDPC_SIZE_BN_PROC_BUF*sizeof(int8_t));
p_procBuf->llrRes = (int8_t*) malloc32_clear(NR_LDPC_MAX_NUM_LLR *sizeof(int8_t));
p_procBuf->llrProcBuf = (int8_t*) malloc32_clear(NR_LDPC_MAX_NUM_LLR *sizeof(int8_t));
p_procBuf->cnProcBuf = (int8_t*) malloc64_clear(NR_LDPC_SIZE_CN_PROC_BUF*sizeof(int8_t));
p_procBuf->cnProcBufRes = (int8_t*) malloc64_clear(NR_LDPC_SIZE_CN_PROC_BUF*sizeof(int8_t));
p_procBuf->bnProcBuf = (int8_t*) malloc64_clear(NR_LDPC_SIZE_BN_PROC_BUF*sizeof(int8_t));
p_procBuf->bnProcBufRes = (int8_t*) malloc64_clear(NR_LDPC_SIZE_BN_PROC_BUF*sizeof(int8_t));
p_procBuf->llrRes = (int8_t*) malloc64_clear(NR_LDPC_MAX_NUM_LLR *sizeof(int8_t));
p_procBuf->llrProcBuf = (int8_t*) malloc64_clear(NR_LDPC_MAX_NUM_LLR *sizeof(int8_t));
}
return(p_procBuf);
......
......@@ -2,9 +2,6 @@
#include <immintrin.h>
#include "../nrLDPCdecoder_defs.h"
#include "../nrLDPC_types.h"
//#include "../nrLDPC_init.h"
//#include "../nrLDPC_mPass.h"
//#include "nrLDPC_cnProc.h"
#include "../nrLDPC_bnProc.h"
#include "cnProc_gen.h"
......@@ -24,6 +21,16 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
fprintf(fd,"#include <stdint.h>\n");
fprintf(fd,"#include <immintrin.h>\n");
fprintf(fd, "__m512i _mm512_sign_epi16(__m512i a, __m512i b){ \n"); /* Emulate _mm512_sign_epi16() with instructions that exist in the AVX-512 instruction set */
fprintf(fd, "b = _mm512_min_epi16(b, _mm512_set1_epi16(1)); \n" );
fprintf(fd, "b = _mm512_max_epi16(b, _mm512_set1_epi16(-1)); \n" );
fprintf(fd, " a = _mm512_mullo_epi16(a, b);\n");
fprintf(fd, "return a;\n");
fprintf(fd, "}\n" );
fprintf(fd,"void nrLDPC_cnProc_BG1_Z%d_%s(int8_t* cnProcBuf,int8_t* cnProcBufRes) {\n",Z,ratestr[R]);
const uint8_t* lut_numCnInCnGroups;
......@@ -34,8 +41,8 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
else if (R==2) lut_numCnInCnGroups = lut_numCnInCnGroups_BG1_R89;
else { printf("aborting, illegal R %d\n",R); fclose(fd);abort();}
//__m256i* p_cnProcBuf;
//__m256i* p_cnProcBufRes;
//__m512i* p_cnProcBuf;
//__m512i* p_cnProcBufRes;
// Number of CNs in Groups
uint32_t M;
......@@ -44,15 +51,15 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// Offset to each bit within a group in terms of 32 Byte
uint32_t bitOffsetInGroup;
//__m256i ymm0, min, sgn;
//__m256i* p_cnProcBufResBit;
//__m512i zmm0, min, sgn;
//__m512i* p_cnProcBufResBit;
// const __m256i* p_ones = (__m256i*) ones256_epi8;
// const __m256i* p_maxLLR = (__m256i*) maxLLR256_epi8;
// const __m512i* p_ones = (__m512i*) ones256_epi8;
// const __m512i* p_maxLLR = (__m512i*) maxLLR256_epi8;
// LUT with offsets for bits that need to be processed
// 1. bit proc requires LLRs of 2. and 3. bit, 2.bits of 1. and 3. etc.
// Offsets are in units of bitOffsetInGroup (1*384/32)
// Offsets are in units of bitOffsetInGroup (1*384/64)
// const uint8_t lut_idxCnProcG3[3][2] = {{12,24}, {0,24}, {0,12}};
// =====================================================================
......@@ -60,26 +67,29 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
fprintf(fd,"//Process group with 3 BNs\n");
// LUT with offsets for bits that need to be processed
// 1. bit proc requires LLRs of 2. and 3. bit, 2.bits of 1. and 3. etc.
// Offsets are in units of bitOffsetInGroup (1*384/32)
const uint8_t lut_idxCnProcG3[3][2] = {{12,24}, {0,24}, {0,12}};
// Offsets are in units of bitOffsetInGroup (1*384/64)=6
// Offsets are in units of bitOffsetInGroup (1*384/64)=6
const uint8_t lut_idxCnProcG3[3][2] = {{6,12}, {0,12}, {0,6}};
fprintf(fd," __m256i ymm0, min, sgn,ones,maxLLR;\n");
fprintf(fd," ones = _mm256_set1_epi8((char)1);\n");
fprintf(fd," maxLLR = _mm256_set1_epi8((char)127);\n");
fprintf(fd," __m512i zmm0, min, sgn,ones,maxLLR;\n");
fprintf(fd," ones = _mm512_set1_epi8((char)1);\n");
fprintf(fd," maxLLR = _mm512_set1_epi8((char)127);\n");
if (lut_numCnInCnGroups[0] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[0]*Z + 31)>>5;
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[0]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[0]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 3
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[0]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[0]];
//p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[0]];
//p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[0]];
// Loop over every BN
int iprime=0;
......@@ -94,54 +104,54 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
fprintf(fd," for (int i=0;i<%d;i+=2) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
fprintf(fd," min = _mm512_abs_epi8(zmm0);\n");
// 32 CNs of second BN
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]);
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// sgn = _mm512_sign_epi16(sgn, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n");
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup));
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup));
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]+1);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][0]+1);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
fprintf(fd," min = _mm512_abs_epi8(zmm0);\n");
// 32 CNs of second BN
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]+1);
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][1] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[0]>>5)+lut_idxCnProcG3[j][1]+1);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// sgn = _mm512_sign_epi16(sgn, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n");
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup)+1);
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[0]>>5)+(j*bitOffsetInGroup)+1);
fprintf(fd," }\n");
}
......@@ -150,22 +160,22 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// =====================================================================
// Process group with 4 BNs
fprintf(fd,"//Process group with 4 BNs\n");
// Offset is 5*384/32 = 60
const uint8_t lut_idxCnProcG4[4][3] = {{60,120,180}, {0,120,180}, {0,60,180}, {0,60,120}};
// Offset is 5*384/64 = 30
const uint8_t lut_idxCnProcG4[4][3] = {{30,60,90}, {0,60,90}, {0,30,90}, {0,30,60}};
if (lut_numCnInCnGroups[1] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[1]*Z + 31)>>5;
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[1]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[1]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
//p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
......@@ -179,32 +189,32 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
fprintf(fd," min = _mm512_abs_epi8(zmm0);\n");
// Loop over BNs
for (k=1; k<3; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][k]);
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[1]>>5)+lut_idxCnProcG4[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// sgn = _mm512_sign_epi16(sgn, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[1]>>5)+(j*bitOffsetInGroup));
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[1]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -213,24 +223,24 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// =====================================================================
// Process group with 5 BNs
fprintf(fd,"//Process group with 5 BNs\n");
// Offset is 18*384/32 = 216
const uint16_t lut_idxCnProcG5[5][4] = {{216,432,648,864}, {0,432,648,864},
{0,216,648,864}, {0,216,432,864}, {0,216,432,648}};
// Offset is 18*384/64 = 216
const uint16_t lut_idxCnProcG5[5][4] = {{108,216,324,432}, {0,216,324,432},
{0,108,324,432}, {0,108,216,432}, {0,108,216,324}};
if (lut_numCnInCnGroups[2] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[2]*Z + 31)>>5;
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[2]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[2]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
//p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
......@@ -244,32 +254,32 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
fprintf(fd," min = _mm512_abs_epi8(zmm0);\n");
// Loop over BNs
for (k=1; k<4; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][k]);
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[2]>>5)+lut_idxCnProcG5[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// sgn = _mm512_sign_epi16(sgn, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[2]>>5)+(j*bitOffsetInGroup));
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[2]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -277,25 +287,25 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// =====================================================================
// Process group with 6 BNs
fprintf(fd,"//Process group with 6 BNs\n");
// Offset is 8*384/32 = 96
const uint16_t lut_idxCnProcG6[6][5] = {{96,192,288,384,480}, {0,192,288,384,480},
{0,96,288,384,480}, {0,96,192,384,480},
{0,96,192,288,480}, {0,96,192,288,384}};
// Offset is 8*384/64 = 48
const uint16_t lut_idxCnProcG6[6][5] = {{48,96,144,192,240}, {0,96,144,192,240},
{0,48,144,192,240}, {0,48,96,192,240},
{0,48,96,144,240}, {0,48,96,144,192}};
if (lut_numCnInCnGroups[3] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[3]*Z + 31)>>5;
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[3]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[3]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
//p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
......@@ -309,32 +319,32 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
fprintf(fd," min = _mm512_abs_epi8(zmm0);\n");
// Loop over BNs
for (k=1; k<5; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][k]);
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[3]>>5)+lut_idxCnProcG6[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// sgn = _mm512_sign_epi16(sgn, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[3]>>5)+(j*bitOffsetInGroup));
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[3]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -343,27 +353,26 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// =====================================================================
// Process group with 7 BNs
fprintf(fd,"//Process group with 7 BNs\n");
// Offset is 5*384/32 = 60
const uint16_t lut_idxCnProcG7[7][6] = {{60,120,180,240,300,360}, {0,120,180,240,300,360},
{0,60,180,240,300,360}, {0,60,120,240,300,360},
{0,60,120,180,300,360}, {0,60,120,180,240,360},
{0,60,120,180,240,300}};
// Offset is 5*384/64 = 30
const uint16_t lut_idxCnProcG7[7][6] = {{30,60,90,120,150,180}, {0,60,90,120,150,180},
{0,30,90,120,150,180}, {0,30,60,120,150,180},
{0,30,60,90,150,180}, {0,30,60,90,120,180},
{0,30,60,90,120,150}};
if (lut_numCnInCnGroups[4] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[4]*Z + 31)>>5;
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[4]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[4]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
//p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
......@@ -377,32 +386,32 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG7[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG7[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
fprintf(fd," min = _mm512_abs_epi8(zmm0);\n");
// Loop over BNs
for (k=1; k<6; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG7[j][k]);
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[4]>>5)+lut_idxCnProcG7[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// sgn = _mm512_sign_epi16(sgn, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[4]>>5)+(j*bitOffsetInGroup));
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[4]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -411,28 +420,27 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// =====================================================================
// Process group with 8 BNs
fprintf(fd,"//Process group with 8 BNs\n");
// Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG8[8][7] = {{24,48,72,96,120,144,168}, {0,48,72,96,120,144,168},
{0,24,72,96,120,144,168}, {0,24,48,96,120,144,168},
{0,24,48,72,120,144,168}, {0,24,48,72,96,144,168},
{0,24,48,72,96,120,168}, {0,24,48,72,96,120,144}};
// Offset is 2*384/64 = 12
const uint8_t lut_idxCnProcG8[8][7] = {{12,24,36,48,56,72,84}, {0,24,36,48,56,72,84},
{0,12,36,48,56,72,84}, {0,12,24,48,56,72,84},
{0,12,24,36,56,72,84}, {0,12,24,36,48,72,84},
{0,12,24,36,48,56,84}, {0,12,24,36,48,120,72}};
if (lut_numCnInCnGroups[5] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[5]*Z + 31)>>5;
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[5]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[5]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 4
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
//p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
......@@ -446,32 +454,32 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG8[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG8[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
fprintf(fd," min = _mm512_abs_epi8(zmm0);\n");
// Loop over BNs
for (k=1; k<7; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG8[j][k]);
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[5]>>5)+lut_idxCnProcG8[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// sgn = _mm512_sign_epi16(sgn, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[5]>>5)+(j*bitOffsetInGroup));
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[5]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -479,13 +487,12 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// =====================================================================
// Process group with 9 BNs
fprintf(fd,"//Process group with 9 BNs\n");
// Offset is 2*384/32 = 24
const uint8_t lut_idxCnProcG9[9][8] = {{24,48,72,96,120,144,168,192}, {0,48,72,96,120,144,168,192},
{0,24,72,96,120,144,168,192}, {0,24,48,96,120,144,168,192},
{0,24,48,72,120,144,168,192}, {0,24,48,72,96,144,168,192},
{0,24,48,72,96,120,168,192}, {0,24,48,72,96,120,144,192},
{0,24,48,72,96,120,144,168}};
// Offset is 2*384/64 = 12
const uint8_t lut_idxCnProcG9[9][8] = {{12,24,36,48,60,72,84,96}, {0,24,36,48,60,72,84,96},
{0,12,36,48,60,72,84,96}, {0,12,24,48,60,72,84,96},
{0,12,24,36,60,72,84,96}, {0,12,24,36,48,72,84,96},
{0,12,24,36,48,60,84,96}, {0,12,24,36,48,60,72,96},
{0,12,24,36,48,60,72,84}};
......@@ -493,16 +500,16 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
if (lut_numCnInCnGroups[6] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[6]*Z + 31)>>5;
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[6]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[6]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 9
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
//p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
......@@ -516,32 +523,32 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>5)+lut_idxCnProcG9[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>5)+lut_idxCnProcG9[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
fprintf(fd," min = _mm512_abs_epi8(zmm0);\n");
// Loop over BNs
for (k=1; k<8; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>5)+lut_idxCnProcG9[j][k]);
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[6]>>5)+lut_idxCnProcG9[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// sgn = _mm512_sign_epi16(sgn, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[6]>>5)+(j*bitOffsetInGroup));
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[6]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -549,12 +556,12 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// =====================================================================
// Process group with 10 BNs
fprintf(fd,"//Process group with 10 BNs\n");
// Offset is 1*384/32 = 12
const uint8_t lut_idxCnProcG10[10][9] = {{12,24,36,48,60,72,84,96,108}, {0,24,36,48,60,72,84,96,108},
{0,12,36,48,60,72,84,96,108}, {0,12,24,48,60,72,84,96,108},
{0,12,24,36,60,72,84,96,108}, {0,12,24,36,48,72,84,96,108},
{0,12,24,36,48,60,84,96,108}, {0,12,24,36,48,60,72,96,108},
{0,12,24,36,48,60,72,84,108}, {0,12,24,36,48,60,72,84,96}};
// Offset is 1*384/64 = 6
const uint8_t lut_idxCnProcG10[10][9] = {{6,12,18,24,30,36,42,48,54}, {0,12,18,24,30,36,42,48,54},
{0,6,18,24,30,36,42,48,54}, {0,6,12,24,30,36,42,48,54},
{0,6,12,18,30,36,42,48,54}, {0,6,12,18,24,36,42,48,54},
{0,6,12,18,24,30,42,48,54}, {0,6,12,18,24,30,36,48,54},
{0,6,12,18,24,30,36,42,54}, {0,6,12,36,24,30,36,42,48}};
......@@ -563,16 +570,16 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
if (lut_numCnInCnGroups[7] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[7]*Z + 31)>>5;
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[7]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[7]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 10
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
//p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
......@@ -586,32 +593,32 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>5)+lut_idxCnProcG10[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>5)+lut_idxCnProcG10[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
fprintf(fd," min = _mm512_abs_epi8(zmm0);\n");
// Loop over BNs
for (k=1; k<9; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>5)+lut_idxCnProcG10[j][k]);
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[7]>>5)+lut_idxCnProcG10[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// sgn = _mm512_sign_epi16(sgn, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[7]>>5)+(j*bitOffsetInGroup));
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[7]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......@@ -620,32 +627,32 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// =====================================================================
// Process group with 19 BNs
fprintf(fd,"//Process group with 19 BNs\n");
// Offset is 4*384/32 = 12
const uint16_t lut_idxCnProcG19[19][18] = {{48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,192,240,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,240,288,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,288,336,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,336,384,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,384,432,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,432,480,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,480,528,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,528,576,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,576,624,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,624,672,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,672,720,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,720,768,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,768,816,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,816,864}, {0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,864},
{0,48,96,144,192,240,288,336,384,432,480,528,576,624,672,720,768,816}};
// Offset is 4*384/64 = 24
const uint16_t lut_idxCnProcG19[19][18] = {{24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432},
{0,24,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,96,120,144,168,192,216,240,264,288,312,336,360,384,408,432},
{0,24,48,72,120,144,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,96,144,168,192,216,240,264,288,312,336,360,384,408,432},
{0,24,48,72,96,120,168,192,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,96,120,144,192,216,240,264,288,312,336,360,384,408,432},
{0,24,48,72,96,120,144,168,216,240,264,288,312,336,360,384,408,432}, {0,24,48,72,96,120,144,168,192,240,264,288,312,336,360,384,408,432},
{0,24,48,72,96,120,144,168,192,216,264,288,312,336,360,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,288,312,336,360,384,408,432},
{0,24,48,72,96,120,144,168,192,216,240,264,312,336,360,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,336,360,384,408,432},
{0,24,48,72,96,120,144,168,192,216,240,264,288,312,360,384,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,384,408,432},
{0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,408,432}, {0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,432},
{0,24,48,72,96,120,144,168,192,216,240,264,288,312,336,360,384,408}};
if (lut_numCnInCnGroups[8] > 0)
{
// Number of groups of 32 CNs for parallel processing
// Ceil for values not divisible by 32
M = (lut_numCnInCnGroups[8]*Z + 31)>>5;
// Ceil for values not divisible by 64
M = (lut_numCnInCnGroups[8]*Z + 63)>>5;
// Set the offset to each bit within a group in terms of 32 Byte
bitOffsetInGroup = (lut_numCnInCnGroups_BG1_R13[8]*NR_LDPC_ZMAX)>>5;
// Set pointers to start of group 19
//p_cnProcBuf = (__m256i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m256i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
//p_cnProcBuf = (__m512i*) &cnProcBuf [lut_startAddrCnGroups[1]];
//p_cnProcBufRes = (__m512i*) &cnProcBufRes[lut_startAddrCnGroups[1]];
// Loop over every BN
int iprime=0;
......@@ -659,32 +666,32 @@ void nrLDPC_cnProc_BG1_generator(uint16_t Z,int R)
// {
fprintf(fd," for (int i=0;i<%d;i++) {\n",M);
// Abs and sign of 32 CNs (first BN)
// ymm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>5)+lut_idxCnProcG19[j][0]);
// sgn = _mm256_sign_epi8(ones, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(ones, ymm0);\n");
// min = _mm256_abs_epi8(ymm0);
fprintf(fd," min = _mm256_abs_epi8(ymm0);\n");
// zmm0 = p_cnProcBuf[lut_idxCnProcG3[j][0] + i];
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>5)+lut_idxCnProcG19[j][0]);
// sgn = _mm512_sign_epi16(ones, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(ones, zmm0);\n");
// min = _mm512_abs_epi8(zmm0);
fprintf(fd," min = _mm512_abs_epi8(zmm0);\n");
// Loop over BNs
for (k=1; k<18; k++)
{
fprintf(fd," ymm0 = ((__m256i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>5)+lut_idxCnProcG19[j][k]);
fprintf(fd," zmm0 = ((__m512i*)cnProcBuf)[%d+i];\n",(lut_startAddrCnGroups[8]>>5)+lut_idxCnProcG19[j][k]);
// min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));
fprintf(fd," min = _mm256_min_epu8(min, _mm256_abs_epi8(ymm0));\n");
// min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));
fprintf(fd," min = _mm512_min_epu8(min, _mm512_abs_epi8(zmm0));\n");
// sgn = _mm256_sign_epi8(sgn, ymm0);
fprintf(fd," sgn = _mm256_sign_epi8(sgn, ymm0);\n");
// sgn = _mm512_sign_epi16(sgn, zmm0);
fprintf(fd," sgn = _mm512_sign_epi16(sgn, zmm0);\n");
}
// Store result
// min = _mm256_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm256_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm256_sign_epi8(min, sgn);
// min = _mm512_min_epu8(min, maxLLR); // 128 in epi8 is -127
fprintf(fd," min = _mm512_min_epu8(min, maxLLR);\n");
// *p_cnProcBufResBit = _mm512_sign_epi16(min, sgn);
// p_cnProcBufResBit++;
fprintf(fd," ((__m256i*)cnProcBufRes)[%d+i] = _mm256_sign_epi8(min, sgn);\n",(lut_startAddrCnGroups[8]>>5)+(j*bitOffsetInGroup));
fprintf(fd," ((__m512i*)cnProcBufRes)[%d+i] = _mm512_sign_epi16(min, sgn);\n",(lut_startAddrCnGroups[8]>>5)+(j*bitOffsetInGroup));
fprintf(fd," }\n");
}
}
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -191,11 +191,11 @@ static const uint16_t lut_startAddrBnGroupsLlr_BG2_R13[NR_LDPC_NUM_BN_GROUPS_BG2
/** Start address for every BN group within the LLR processing buffer for BG2 rate = 2/3 */
static const uint16_t lut_startAddrBnGroupsLlr_BG2_R23[NR_LDPC_NUM_BN_GROUPS_BG2_R23] = {0, 1152, 2304, 4224, 5376, 6144};
/** Vector of 32 '1' in int8 for application with AVX2 */
static const int8_t ones256_epi8[32] __attribute__ ((aligned(32))) = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
/** Vector of 32 '0' in int8 for application with AVX2 */
static const int8_t zeros256_epi8[32] __attribute__ ((aligned(32))) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
/** Vector of 32 '127' in int8 for application with AVX2 */
static const int8_t maxLLR256_epi8[32] __attribute__ ((aligned(32))) = {127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127};
/** Vector of 64 '1' in int8 for application with AVX512 */
static const int8_t ones512_epi8[64] __attribute__ ((aligned(64))) = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
/** Vector of 64 '0' in int8 for application with AVX512 */
static const int8_t zeros512_epi8[64] __attribute__ ((aligned(64))) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
/** Vector of 64 '127' in int8 for application with AVX512 */
static const int8_t maxLLR512_epi8[64] __attribute__ ((aligned(64))) = {127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127};
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment