Commit c8a787d2 authored by Sy's avatar Sy

optimized 5G NR LDPC decoder

parent 559e8fdc
...@@ -71,7 +71,7 @@ fprintf(fd, "// Process group with 1 CNs \n"); ...@@ -71,7 +71,7 @@ fprintf(fd, "// Process group with 1 CNs \n");
// Process group with 2 CNs // Process group with 2 CNs
/*
if (lut_numBnInBnGroups[0] > 0) if (lut_numBnInBnGroups[0] > 0)
{ {
// If elements in group move to next address // If elements in group move to next address
...@@ -123,7 +123,7 @@ fprintf(fd, "// Process group with 1 CNs \n"); ...@@ -123,7 +123,7 @@ fprintf(fd, "// Process group with 1 CNs \n");
} }
// ===================================================================== // =====================================================================
// Process group with 2 CNs // Process group with 2 CNs
*/
fprintf(fd, "// Process group with 2 CNs \n"); fprintf(fd, "// Process group with 2 CNs \n");
......
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
#define NB_R 3 #define NB_R 3
void nrLDPC_bnProc_BG1_generator_AVX512(int); void nrLDPC_bnProc_BG1_generator_AVX512(int);
void nrLDPC_bnProc_BG2_generator_AVX512(int); void nrLDPC_bnProc_BG2_generator_AVX512(int);
//void nrLDPC_bnProcPc_BG1_generator_AVX2(int); void nrLDPC_bnProcPc_BG1_generator_AVX512(int);
//void nrLDPC_bnProcPc_BG2_generator_AVX2(int); void nrLDPC_bnProcPc_BG2_generator_AVX512(int);
int main() int main()
{ {
...@@ -15,8 +15,8 @@ int main() ...@@ -15,8 +15,8 @@ int main()
nrLDPC_bnProc_BG1_generator_AVX512(R[i]); nrLDPC_bnProc_BG1_generator_AVX512(R[i]);
nrLDPC_bnProc_BG2_generator_AVX512(R[i]); nrLDPC_bnProc_BG2_generator_AVX512(R[i]);
// nrLDPC_bnProcPc_BG1_generator_AVX2(R[i]); nrLDPC_bnProcPc_BG1_generator_AVX512(R[i]);
// nrLDPC_bnProcPc_BG2_generator_AVX2(R[i]); nrLDPC_bnProcPc_BG2_generator_AVX512(R[i]);
} }
......
#include <stdint.h>
#include <immintrin.h>
void nrLDPC_bnProc_BG2_R23_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
__m256i* p_bnProcBuf;
__m256i* p_bnProcBufRes;
__m256i* p_llrRes;
__m256i* p_res;
uint32_t M, i;
// Process group with 2 CNs
M = (3*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [1152];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [1152];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [1152];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [1152];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
}
// Process group with 3 CNs
M = (5*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [3456];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [3456];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [2304];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[60];
p_llrRes = (__m256i*) &llrRes [2304];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]);
}
p_res = &p_bnProcBufRes[120];
p_llrRes = (__m256i*) &llrRes [2304];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]);
}
// Process group with 4 CNs
M = (3*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [9216];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [9216];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
}
p_res = &p_bnProcBufRes[72];
p_llrRes = (__m256i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]);
}
p_res = &p_bnProcBufRes[108];
p_llrRes = (__m256i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]);
}
// Process group with 5 CNs
M = (2*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [13824];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [13824];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
}
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]);
}
p_res = &p_bnProcBufRes[72];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]);
}
p_res = &p_bnProcBufRes[96];
p_llrRes = (__m256i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]);
}
// Process group with 6 CNs
M = (1*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [17664];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [17664];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
}
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
}
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]);
}
p_res = &p_bnProcBufRes[60];
p_llrRes = (__m256i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]);
}
// Process group with 7 CNs
// Process group with 8 CNs
// Process group with 9 CNs
// Process group with 10 CNs
// Process group with 11 CNs
// Process group with 12 CNs
// Process group with 13 CNs
// Process group with 14 CNs
// Process group with 15 CNs
// Process group with 16 CNs
// Process group with 17 CNs
// Process group with 18 CNs
// Process group with 19 CNs
// Process group with 20 CNs
// Process group with 21 CNs
// Process group with 22 CNs
// Process group with <23 CNs
// Process group with 24 CNs
// Process group with 25 CNs
// Process group with 26 CNs
// Process group with 27 CNs
// Process group with 28 CNs
// Process group with 29 CNs
// Process group with 30 CNs
}
...@@ -5,20 +5,6 @@ static inline void nrLDPC_bnProcPc_BG1_R13_AVX2(int8_t* bnProcBuf,int8_t* llrRes ...@@ -5,20 +5,6 @@ static inline void nrLDPC_bnProcPc_BG1_R13_AVX2(int8_t* bnProcBuf,int8_t* llrRes
__m256i* p_llrRes; __m256i* p_llrRes;
uint32_t M ; uint32_t M ;
// Process group with 1 CNs // Process group with 1 CNs
M = (42*Z + 31)>>5;
p_bnProcBuf = (__m128i*) &bnProcBuf [0];
p_llrProcBuf = (__m128i*) &llrProcBuf [0];
p_llrRes = (__m256i*) &llrRes [0];
for (int i=0,j=0;i<M;i++,j+=2) {
ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[j + 1]);
ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
p_llrRes[i] = _mm256_permute4x64_epi64(ymm0, 0xD8);
}
// Process group with 2 CNs // Process group with 2 CNs
// Process group with 3 CNs // Process group with 3 CNs
// Process group with 4 CNs // Process group with 4 CNs
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment