Commit 3c013cf1, authored by Hongzhi Wang, committed by laurent

minor changes to build/run on x86

parent 2c666a8e
......@@ -35,7 +35,7 @@
#include "nrLDPC_bnProc.h"
#define UNROLL_CN_PROC 1
#define UNROLL_BN_PROC 1
//#define UNROLL_BN_PROC_PC 1
#define UNROLL_BN_PROC_PC 1
#define UNROLL_BN2CN_PROC 1
/*----------------------------------------------------------------------
| cn processing files -->AVX512
......
......@@ -80,17 +80,66 @@ void nrLDPC_bnProcPc_BG1_generator_AVX2(const char *dir, int R)
uint32_t cnOffsetInGroup;
uint8_t idxBnGroup = 0;
fprintf(fd," __m256i ymm0, ymm1, ymmRes0, ymmRes1; \n");
fprintf(fd," simde__m256i* p_bnProcBuf; \n");
fprintf(fd," simde__m256i* p_llrProcBuf;\n");
fprintf(fd," simde__m256i* p_llrRes; \n");
// fprintf(fd," simde__m256i* p_bnProcBufRes; \n");
// fprintf(fd," simde__m256i* p_llrProcBuf256; \n");
fprintf(fd," __m128i* p_bnProcBuf; \n");
fprintf(fd," __m128i* p_llrProcBuf;\n");
fprintf(fd," __m256i* p_llrRes; \n");
// fprintf(fd," __m256i* p_bnProcBufRes; \n");
// fprintf(fd," __m256i* p_llrProcBuf256; \n");
fprintf(fd," uint32_t M ;\n");
fprintf(fd, "// Process group with 1 CNs \n");
/*
// Process group with 1 CNs
// if (lut_numBnInBnGroups[0] > 0)
// {
// If elements in group move to next address
// idxBnGroup++;
// Number of groups of 32 BNs for parallel processing
fprintf(fd," M = (%d*Z + 31)>>5;\n",lut_numBnInBnGroups[0] );
// Set the offset to each CN within a group in terms of 16 Byte
// cnOffsetInGroup = (lut_numBnInBnGroups[0]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of the group with 1 CN
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
// fprintf(fd," p_bnProcBufRes = (__m256i*) &bnProcBufRes [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// fprintf(fd," p_llrProcBuf256 = (__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
fprintf(fd," p_bnProcBufRes[i] = p_llrProcBuf256[i];\n");
// First 16 LLRs of first CN
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymm0, ymm1);\n");
// Second 16 LLRs of first CN
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[j + 1 ]);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j + 1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymm0, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
//}
*/
// =====================================================================
// Process group with 2 CNs
......@@ -111,24 +160,39 @@ fprintf(fd, "// Process group with 2 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[1]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 2
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);\n");
// Loop over CNs
for (k=1; k<2; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
......@@ -154,27 +218,39 @@ fprintf(fd, "// Process group with 3 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[2]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of the group with 3 CNs
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<3; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -198,24 +274,39 @@ fprintf(fd, "// Process group with 4 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[3]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of the group with 4 CNs
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<4; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -224,7 +315,7 @@ fprintf(fd, "// Process group with 4 CNs \n");
// =====================================================================
// Process group with 5 CNs
fprintf(fd, "// Process group with 5 CNs \n");
fprintf(fd, "// Process group with 5 CNs \n");
// Process group with 5 CNs
......@@ -240,24 +331,38 @@ fprintf(fd, "// Process group with 4 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[4]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of the group with 5 CNs
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<5; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -283,24 +388,38 @@ fprintf(fd, "// Process group with 6 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[5]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of the group with 6 CNs
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<6; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -325,27 +444,44 @@ fprintf(fd, "// Process group with 7 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[6]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of the group with 7 CNs
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<7; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
//fprintf(fd," (__m256i*) &llrRes[%d + i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n",lut_startAddrBnGroupsLlr[idxBnGroup]>>5 );
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
// =====================================================================
// Process group with 8 CNs
......@@ -365,27 +501,42 @@ fprintf(fd, "// Process group with 8 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[7]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of the group with 8 CNs
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<8; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
//fprintf(fd," (__m256i*) &llrRes[%d + i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n",lut_startAddrBnGroupsLlr[idxBnGroup]>>5 );
fprintf(fd,"}\n");
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
// =====================================================================
......@@ -407,27 +558,41 @@ fprintf(fd, "// Process group with 9 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[8]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of the group with 9 CNs
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<9; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
//fprintf(fd," (__m256i*) &llrRes[%d + i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n",lut_startAddrBnGroupsLlr[idxBnGroup]>>5 );
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -450,27 +615,40 @@ fprintf(fd, "// Process group with 10 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[9]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of the group with 10 CNs
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<10; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -495,24 +673,38 @@ fprintf(fd, "// Process group with 11 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[10]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of the group with 11 CNs
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<11; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -522,7 +714,7 @@ fprintf(fd, "// Process group with 11 CNs \n");
fprintf(fd, "// Process group with 12 CNs \n");
// Process group with 12 CNs
// Process group with 2 CNs
if (lut_numBnInBnGroups[11] > 0)
{
......@@ -536,24 +728,38 @@ fprintf(fd, "// Process group with 12 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[11]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of the group with 12 CNs
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<12; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -564,7 +770,7 @@ fprintf(fd, "// Process group with 12 CNs \n");
fprintf(fd, "// Process group with 13 CNs \n");
// Process group with 13 CNs
// Process group with 3 CNs
if (lut_numBnInBnGroups[12] > 0)
{
......@@ -578,24 +784,38 @@ fprintf(fd, "// Process group with 13 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[12]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of the group with 13 CNs
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<13; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -603,7 +823,7 @@ fprintf(fd, "// Process group with 13 CNs \n");
// =====================================================================
// Process group with 14 CNs
// Process group with 4 CNs
fprintf(fd, "// Process group with 14 CNs \n");
......@@ -621,24 +841,38 @@ fprintf(fd, "// Process group with 14 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[13]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<14; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -663,24 +897,38 @@ fprintf(fd, "// Process group with 15 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[14]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<15; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -706,24 +954,38 @@ fprintf(fd, "// Process group with 16 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[15]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<16; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -748,24 +1010,38 @@ fprintf(fd, "// Process group with 17 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[16]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<17; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -790,29 +1066,44 @@ fprintf(fd, "// Process group with 18 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[17]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<18; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
// =====================================================================
// Process group with 19 CNs
// Process group with 9 CNs
fprintf(fd, "// Process group with 19 CNs \n");
......@@ -830,24 +1121,38 @@ fprintf(fd, "// Process group with 19 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[18]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// First 16 LLRs of first CN
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<19; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -872,24 +1177,38 @@ fprintf(fd, "// Process group with 20 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[19]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// First 16 LLRs of first CN
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<20; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -918,24 +1237,38 @@ fprintf(fd, "// Process group with 21 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[20]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// First 16 LLRs of first CN
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<21; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -959,30 +1292,44 @@ fprintf(fd, "// Process group with 22 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[21]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// First 16 LLRs of first CN
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<22; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
// =====================================================================
// Process group with 23 CNs
// Process group with 13 CNs
fprintf(fd, "// Process group with <23 CNs \n");
......@@ -1001,24 +1348,38 @@ fprintf(fd, "// Process group with <23 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[22]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// First 16 LLRs of first CN
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<23; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -1026,11 +1387,11 @@ fprintf(fd, "// Process group with <23 CNs \n");
// =====================================================================
// Process group with 24 CNs
// Process group with 4 CNs
fprintf(fd, "// Process group with 24 CNs \n");
// Process group with 24 CNs
// Process group with 4 CNs
if (lut_numBnInBnGroups[23] > 0)
{
......@@ -1044,24 +1405,38 @@ fprintf(fd, "// Process group with 24 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[23]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// First 16 LLRs of first CN
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<24; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -1086,24 +1461,38 @@ fprintf(fd, "// Process group with 25 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[24]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// First 16 LLRs of first CN
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<25; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......@@ -1129,35 +1518,49 @@ fprintf(fd, "// Process group with 26 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[25]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// First 16 LLRs of first CN
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<26; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
// =====================================================================
// Process group with 27 CNs
// Process group with 17 CNs
fprintf(fd, "// Process group with 27 CNs \n");
// Process group with 27 CNs
// Process group with 17 CNs
if (lut_numBnInBnGroups[26] > 0)
{
......@@ -1171,35 +1574,49 @@ fprintf(fd, "// Process group with 27 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[26]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (indexed by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// First 16 LLRs of first CN
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<27; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
// =====================================================================
// Process group with 28 CNs
// (the same heading is written into the generated file by the fprintf below)
fprintf(fd, "// Process group with 28 CNs \n");
// Process group with 28 CNs
// (entry 27 of lut_numBnInBnGroups; the section is skipped when that entry is 0)
if (lut_numBnInBnGroups[27] > 0)
{
......@@ -1213,34 +1630,48 @@ fprintf(fd, "// Process group with 28 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[27]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (selected by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// First 16 LLRs of first CN
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<28; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
// =====================================================================
// Process group with 29 CNs
// (the same heading is written into the generated file by the fprintf below)
fprintf(fd, "// Process group with 29 CNs \n");
// Process group with 29 CNs
// (entry 28 of lut_numBnInBnGroups; the section is skipped when that entry is 0)
if (lut_numBnInBnGroups[28] > 0)
{
......@@ -1254,35 +1685,49 @@ fprintf(fd, "// Process group with 29 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[28]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (selected by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// First 16 LLRs of first CN
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<29; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
// =====================================================================
// Process group with 30 CNs
// (the same heading is written into the generated file by the fprintf below)
fprintf(fd, "// Process group with 30 CNs \n");
// Process group with 30 CNs
// (entry 29 of lut_numBnInBnGroups; the section is skipped when that entry is 0)
if (lut_numBnInBnGroups[29] > 0)
{
......@@ -1296,24 +1741,38 @@ fprintf(fd, "// Process group with 30 CNs \n");
cnOffsetInGroup = (lut_numBnInBnGroups[29]*NR_LDPC_ZMAX)>>4;
// Set pointers to the start of the current BN group (selected by idxBnGroup)
fprintf(fd," p_bnProcBuf = (simde__m256i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m256i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// First 16 LLRs of first CN
fprintf(fd," p_bnProcBuf = (__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (__m256i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_llrRes[i] = p_bnProcBuf [i];\n");
fprintf(fd," for (int i=0,j=0;i<M;i++,j+=2) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j]);\n");
fprintf(fd," ymmRes1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);\n");
// Loop over CNs
for (k=1; k<30; k++)
{
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_bnProcBuf[%d + i]);\n",k*cnOffsetInGroup);
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_bnProcBuf[%d + j +1]);\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," p_llrRes[i] = simde_mm256_adds_epi8(p_llrRes[i], p_llrProcBuf[i]);\n");
fprintf(fd," ymm0 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j]);\n");
fprintf(fd," ymmRes0 = simde_mm256_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);\n");
fprintf(fd," ymmRes1 = simde_mm256_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," ymm0 = simde_mm256_packs_epi16(ymmRes0, ymmRes1);\n");
// ymm0 = [ymmRes1[255:128] ymmRes0[255:128] ymmRes1[127:0] ymmRes0[127:0]]
// p_llrRes = [ymmRes1[255:128] ymmRes1[127:0] ymmRes0[255:128] ymmRes0[127:0]]
fprintf(fd," p_llrRes[i] = simde_mm256_permute4x64_epi64(ymm0, 0xD8);\n");
fprintf(fd,"}\n");
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment