Commit 559e8fdc authored by Sy

degradation of performance in BLER fixed | bnProc & bnProcPc unrolled |...

Fixed the degradation of BLER performance | bnProc & bnProcPc unrolled | small improvement in execution times
parent 0f704894
# Builds the bnProc code generator from every .c file in this directory.
# CC (not "C") is the variable the recipes below actually read.
CC=gcc
CFLAGS=-W -Wall -mavx2
LDFLAGS=
EXEC=bnProc_gen_avx512
SRC= $(wildcard *.c)
OBJ= $(SRC:.c=.o)

all: $(EXEC)

# Link all object files into the generator executable.
bnProc_gen_avx512: $(OBJ)
	@$(CC) -o $@ $^ $(LDFLAGS) -O2

# Compile one source file; headers are also searched under ${OPENAIR_HOME}/openair1.
%.o: %.c
	@$(CC) -o $@ -c $< $(CFLAGS) -I ${OPENAIR_HOME}/openair1 -g -std=c99

# These targets never produce a file of the same name.
.PHONY: all clean mrproper zip

# Remove intermediate object files.
clean:
	@rm -rf *.o

# Remove object files and the executable.
mrproper: clean
	@rm -rf $(EXEC)

# Archive the generator sources and this Makefile.
zip:
	@tar -zcvf sauvegarde.tar.gz main.c bnProc_gen_BG1_avx512.c bnProc_gen_BG2_avx512.c Makefile
#include <stdio.h>
#include <stdint.h>
/* Number of entries in the argument table that main() feeds to the generators. */
#define NB_R 3
/* Generator entry points called by main(); each takes one int argument.
 * NOTE(review): presumably defined in the bnProc_gen_BG*_avx512.c files - confirm. */
void nrLDPC_bnProc_BG1_generator_AVX512(int);
void nrLDPC_bnProc_BG2_generator_AVX512(int);
/* The bnProcPc generator prototypes below are currently disabled. */
//void nrLDPC_bnProcPc_BG1_generator_AVX2(int);
//void nrLDPC_bnProcPc_BG2_generator_AVX2(int);
/*
 * Entry point of the generator tool: calls the BG1 and then the BG2 bnProc
 * generator once for each of the NB_R table entries (0, 1, 2), in order.
 * Always returns 0.
 */
int main()
{
    const int genArgs[NB_R] = {0, 1, 2};
    int idx = 0;

    while (idx < NB_R) {
        const int arg = genArgs[idx];

        nrLDPC_bnProc_BG1_generator_AVX512(arg);
        nrLDPC_bnProc_BG2_generator_AVX512(arg);
        /* The bnProcPc generators are not invoked at the moment:
         *   nrLDPC_bnProcPc_BG1_generator_AVX2(arg);
         *   nrLDPC_bnProcPc_BG2_generator_AVX2(arg);
         */
        idx++;
    }

    return 0;
}
/**
 * Accumulates, per group, the int8 messages stored in bnProcBuf together with
 * the matching llrProcBuf values and writes the int8 totals to llrRes.
 *
 * For every 32-byte output vector the message rows of a group (one row per
 * CN, rows separated by a fixed per-group stride) are sign-extended to 16 bit
 * and summed with saturating adds, the llrProcBuf values are added, and the
 * two 16-lane halves are packed back to int8 with signed saturation. The
 * 0xD8 permute undoes the per-128-bit-lane interleaving of the pack so the
 * 32 results land in their original byte order.
 *
 * All byte offsets and row strides are hard-coded for this configuration.
 * The buffers are accessed through __m128i / __m256i pointers, so they must
 * be aligned accordingly (16 bytes for the inputs, 32 bytes for llrRes), and
 * each group is processed in whole 32-byte vectors (vector count rounded up).
 *
 * @param bnProcBuf     input messages (int8), read only
 * @param bnProcBufRes  only written by the 1-CN group, which is empty here
 * @param llrRes        output: saturated int8 totals
 * @param llrProcBuf    input LLR values (int8), read only
 * @param Z             size factor; each group holds (n*Z) values, n per group
 */
static inline void nrLDPC_bnProcPc_BG2_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
    __m256i ymm0, ymm1, ymmRes0, ymmRes1;
    __m128i* p_bnProcBuf;
    __m256i* p_bnProcBufRes;
    __m128i* p_llrProcBuf;
    __m256i* p_llrProcBuf256;
    __m256i* p_llrRes;
    uint32_t M ;
    uint32_t i, j;

    // Process group with 1 CNs
    // M evaluates to 0 for every Z, so this loop never runs in this configuration.
    M = (0*Z + 31)>>5;
    p_bnProcBuf = (__m128i*) &bnProcBuf [0];
    p_bnProcBufRes = (__m256i*) &bnProcBufRes [0];
    p_llrProcBuf = (__m128i*) &llrProcBuf [0];
    p_llrProcBuf256 = (__m256i*) &llrProcBuf [0];
    p_llrRes = (__m256i*) &llrRes [0];
    for (i=0,j=0;i<M;i++,j+=2) {
        p_bnProcBufRes[i] = p_llrProcBuf256[i];
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
        ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
        ymmRes0 = _mm256_adds_epi16(ymm0, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j+1]);
        ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
        ymmRes1 = _mm256_adds_epi16(ymm0, ymm1);
        ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
        *p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
        p_llrRes++;
    }

    // Process group with 2 CNs
    M = (3*Z + 31)>>5;
    p_bnProcBuf = (__m128i*) &bnProcBuf [1152];
    p_llrProcBuf = (__m128i*) &llrProcBuf [1152];
    p_llrRes = (__m256i*) &llrRes [1152];
    for (i=0,j=0;i<M;i++,j+=2) {
        // Seed both accumulators from the first message row. The previous code
        // loaded these values into ymm0/ymm1 and overwrote them immediately,
        // leaving ymmRes0/ymmRes1 uninitialised (or stale from the last vector).
        ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
        ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);
        // Second message row
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        // Add the llrProcBuf values
        ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        // Pack to int8 and restore byte order
        ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
        *p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
        p_llrRes++;
    }

    // Process group with 3 CNs
    M = (5*Z + 31)>>5;
    p_bnProcBuf = (__m128i*) &bnProcBuf [3456];
    p_llrProcBuf = (__m128i*) &llrProcBuf [2304];
    p_llrRes = (__m256i*) &llrRes [2304];
    for (i=0,j=0;i<M;i++,j+=2) {
        ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
        ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[120 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[120 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[240 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[240 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
        *p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
        p_llrRes++;
    }

    // Process group with 4 CNs
    M = (3*Z + 31)>>5;
    p_bnProcBuf = (__m128i*) &bnProcBuf [9216];
    p_llrProcBuf = (__m128i*) &llrProcBuf [4224];
    p_llrRes = (__m256i*) &llrRes [4224];
    for (i=0,j=0;i<M;i++,j+=2) {
        ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
        ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[144 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[144 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[216 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[216 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
        *p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
        p_llrRes++;
    }

    // Process group with 5 CNs
    M = (2*Z + 31)>>5;
    p_bnProcBuf = (__m128i*) &bnProcBuf [13824];
    p_llrProcBuf = (__m128i*) &llrProcBuf [5376];
    p_llrRes = (__m256i*) &llrRes [5376];
    for (i=0,j=0;i<M;i++,j+=2) {
        ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
        ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[48 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[48 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[96 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[96 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[144 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[144 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[192 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[192 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
        *p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
        p_llrRes++;
    }

    // Process group with 6 CNs
    M = (1*Z + 31)>>5;
    p_bnProcBuf = (__m128i*) &bnProcBuf [17664];
    p_llrProcBuf = (__m128i*) &llrProcBuf [6144];
    p_llrRes = (__m256i*) &llrRes [6144];
    for (i=0,j=0;i<M;i++,j+=2) {
        ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
        ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[48 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[48 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[96 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[96 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[120 + j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[120 + j +1]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
        ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
        ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
        ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
        ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
        *p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
        p_llrRes++;
    }

    // Groups with 7 to 30 CNs: no processing is emitted for this configuration.
}
/**
 * For every group, subtracts each message row of bnProcBuf from the matching
 * llrRes values using signed saturating int8 arithmetic and stores the
 * differences at the same position in bnProcBufRes.
 *
 * A group with n CNs holds n message rows separated by a fixed stride (in
 * 64-byte vectors); all rows of a group are combined with the same llrRes
 * vectors. Byte offsets and strides are hard-coded for this configuration.
 * All buffers are accessed through __m512i pointers and therefore have to be
 * 64-byte aligned; each row is processed in whole 64-byte vectors (vector
 * count rounded up).
 *
 * @param bnProcBuf     input messages (int8), read only
 * @param bnProcBufRes  output: llrRes minus message, saturated to int8
 * @param llrRes        input values the messages are subtracted from, read only
 * @param Z             size factor; each group holds (n*Z) values per row
 */
static inline void nrLDPC_bnProc_BG1_R89_AVX512(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
    __m512i* msgIn;
    __m512i* msgOut;
    __m512i* llrTot;
    uint32_t nVec, cn, v;

    // Group with 2 CNs: rows are 18 vectors apart
    nVec   = (3*Z + 63)>>6;
    msgIn  = (__m512i*) &bnProcBuf    [384];
    msgOut = (__m512i*) &bnProcBufRes [384];
    llrTot = (__m512i*) &llrRes       [384];
    for (cn = 0; cn < 2; cn++) {
        const uint32_t row = 18*cn;
        for (v = 0; v < nVec; v++) {
            msgOut[row + v] = _mm512_subs_epi8(llrTot[v], msgIn[row + v]);
        }
    }

    // Group with 3 CNs: rows are 126 vectors apart
    nVec   = (21*Z + 63)>>6;
    msgIn  = (__m512i*) &bnProcBuf    [2688];
    msgOut = (__m512i*) &bnProcBufRes [2688];
    llrTot = (__m512i*) &llrRes       [1536];
    for (cn = 0; cn < 3; cn++) {
        const uint32_t row = 126*cn;
        for (v = 0; v < nVec; v++) {
            msgOut[row + v] = _mm512_subs_epi8(llrTot[v], msgIn[row + v]);
        }
    }

    // Group with 4 CNs: rows are 6 vectors apart
    nVec   = (1*Z + 63)>>6;
    msgIn  = (__m512i*) &bnProcBuf    [26880];
    msgOut = (__m512i*) &bnProcBufRes [26880];
    llrTot = (__m512i*) &llrRes       [9600];
    for (cn = 0; cn < 4; cn++) {
        const uint32_t row = 6*cn;
        for (v = 0; v < nVec; v++) {
            msgOut[row + v] = _mm512_subs_epi8(llrTot[v], msgIn[row + v]);
        }
    }

    // Group with 5 CNs: rows are 6 vectors apart
    nVec   = (1*Z + 63)>>6;
    msgIn  = (__m512i*) &bnProcBuf    [28416];
    msgOut = (__m512i*) &bnProcBufRes [28416];
    llrTot = (__m512i*) &llrRes       [9984];
    for (cn = 0; cn < 5; cn++) {
        const uint32_t row = 6*cn;
        for (v = 0; v < nVec; v++) {
            msgOut[row + v] = _mm512_subs_epi8(llrTot[v], msgIn[row + v]);
        }
    }

    // Groups with 6 to 30 CNs: no processing is emitted for this configuration.
}
#include <stdint.h>
#include <immintrin.h>
void nrLDPC_bnProc_BG2_R23_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
__m256i* p_bnProcBuf;
__m256i* p_bnProcBufRes;
__m256i* p_llrRes;
__m256i* p_res;
void nrLDPC_bnProc_BG2_R23_AVX512(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
__m512i* p_bnProcBuf;
__m512i* p_bnProcBufRes;
__m512i* p_llrRes;
__m512i* p_res;
uint32_t M, i;
// Process group with 2 CNs
M = (3*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [1152];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [1152];
M = (3*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [1152];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [1152];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [1152];
p_llrRes = (__m512i*) &llrRes [1152];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [1152];
p_res = &p_bnProcBufRes[18];
p_llrRes = (__m512i*) &llrRes [1152];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[18 + i]);
}
// Process group with 3 CNs
M = (5*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [3456];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [3456];
M = (5*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [3456];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [3456];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [2304];
p_llrRes = (__m512i*) &llrRes [2304];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[60];
p_llrRes = (__m256i*) &llrRes [2304];
p_res = &p_bnProcBufRes[30];
p_llrRes = (__m512i*) &llrRes [2304];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[30 + i]);
}
p_res = &p_bnProcBufRes[120];
p_llrRes = (__m256i*) &llrRes [2304];
p_res = &p_bnProcBufRes[60];
p_llrRes = (__m512i*) &llrRes [2304];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]);
}
// Process group with 4 CNs
M = (3*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [9216];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [9216];
M = (3*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [9216];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [9216];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [4224];
p_llrRes = (__m512i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [4224];
p_res = &p_bnProcBufRes[18];
p_llrRes = (__m512i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[18 + i]);
}
p_res = &p_bnProcBufRes[72];
p_llrRes = (__m256i*) &llrRes [4224];
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m512i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
}
p_res = &p_bnProcBufRes[108];
p_llrRes = (__m256i*) &llrRes [4224];
p_res = &p_bnProcBufRes[54];
p_llrRes = (__m512i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[54 + i]);
}
// Process group with 5 CNs
M = (2*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [13824];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [13824];
M = (2*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [13824];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [13824];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [5376];
p_llrRes = (__m512i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [5376];
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m512i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
}
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [5376];
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m512i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
}
p_res = &p_bnProcBufRes[72];
p_llrRes = (__m256i*) &llrRes [5376];
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m512i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
}
p_res = &p_bnProcBufRes[96];
p_llrRes = (__m256i*) &llrRes [5376];
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m512i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]);
}
// Process group with 6 CNs
M = (1*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [17664];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [17664];
M = (1*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [17664];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [17664];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [6144];
p_llrRes = (__m512i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m256i*) &llrRes [6144];
p_res = &p_bnProcBufRes[6];
p_llrRes = (__m512i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[6 + i]);
}
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [6144];
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m512i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [6144];
p_res = &p_bnProcBufRes[18];
p_llrRes = (__m512i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[18 + i]);
}
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [6144];
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m512i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
}
p_res = &p_bnProcBufRes[60];
p_llrRes = (__m256i*) &llrRes [6144];
p_res = &p_bnProcBufRes[30];
p_llrRes = (__m512i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[30 + i]);
}
// Process group with 7 CNs
// Process group with 8 CNs
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment