Commit 559e8fdc authored by Sy

degradation of performance in BLER fixed | bnProc & bnProcPc unrolled |...

Fixed the degradation of BLER performance | unrolled bnProc & bnProcPc | small improvement in execution times
parent 0f704894
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
......@@ -48,7 +49,6 @@
#include "nrLDPC_tools/ldpc_gen_files/cnProc_avx512/nrLDPC_cnProc_BG2_R13_AVX512.h"
#include "nrLDPC_tools/ldpc_gen_files/cnProc_avx512/nrLDPC_cnProc_BG2_R23_AVX512.h"
#else
/*----------------------------------------------------------------------
......@@ -81,7 +81,18 @@
#include "nrLDPC_tools/ldpc_gen_files/bnProcPc/nrLDPC_bnProcPc_BG2_R23_AVX2.h"
//bnProc----------------------------------------------------------------
#ifdef __AVX512BW__
//BG1-------------------------------------------------------------------
#include "nrLDPC_tools/ldpc_gen_files/bnProc_avx512/nrLDPC_bnProc_BG1_R13_AVX512.h"
#include "nrLDPC_tools/ldpc_gen_files/bnProc_avx512/nrLDPC_bnProc_BG1_R23_AVX512.h"
#include "nrLDPC_tools/ldpc_gen_files/bnProc_avx512/nrLDPC_bnProc_BG1_R89_AVX512.h"
//BG2 --------------------------------------------------------------------
#include "nrLDPC_tools/ldpc_gen_files/bnProc_avx512/nrLDPC_bnProc_BG2_R15_AVX512.h"
#include "nrLDPC_tools/ldpc_gen_files/bnProc_avx512/nrLDPC_bnProc_BG2_R13_AVX512.h"
#include "nrLDPC_tools/ldpc_gen_files/bnProc_avx512/nrLDPC_bnProc_BG2_R23_AVX512.h"
#else
#include "nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG1_R13_AVX2.h"
#include "nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG1_R23_AVX2.h"
#include "nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG1_R89_AVX2.h"
......@@ -90,6 +101,7 @@
#include "nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG2_R13_AVX2.h"
#include "nrLDPC_tools/ldpc_gen_files/bnProc/nrLDPC_bnProc_BG2_R23_AVX2.h"
#endif
......@@ -380,17 +392,29 @@ if (BG==1)
{
case 13:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R13_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG1_R13_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
case 23:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R23_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG1_R23_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
case 89:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R89_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG1_R89_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
}
......@@ -401,20 +425,32 @@ if (BG==1)
{
case 15:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R15_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG2_R15_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
case 13:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R13_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG2_R13_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
case 23:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R23_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG2_R23_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
......@@ -634,23 +670,36 @@ if (BG==1)
#endif
// nrLDPC_bnProc(p_lut, p_procBuf, Z);
if (BG==1)
{
switch (R)
{
case 13:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R13_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG1_R13_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
case 23:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R23_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG1_R23_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
case 89:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R89_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG1_R89_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
}
......@@ -661,20 +710,32 @@ if (BG==1)
{
case 15:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R15_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG2_R15_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
case 13:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R13_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG2_R13_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
case 23:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R23_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG2_R23_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
......@@ -684,7 +745,6 @@ if (BG==1)
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->bnProc);
#endif
......@@ -915,23 +975,36 @@ if (BG==1)
//nrLDPC_bnProc(p_lut, p_procBuf, Z);
if (BG==1)
{
switch (R)
{
case 13:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R13_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG1_R13_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
case 23:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R23_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG1_R23_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
case 89:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG1_R89_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG1_R89_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
}
......@@ -942,20 +1015,32 @@ if (BG==1)
{
case 15:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R15_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG2_R15_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
case 13:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R13_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG2_R13_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
case 23:
{
#ifdef __AVX512BW__
nrLDPC_bnProc_BG2_R23_AVX512(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#else
nrLDPC_bnProc_BG2_R23_AVX2(p_procBuf->bnProcBuf, p_procBuf->bnProcBufRes,p_procBuf->llrRes, Z);
#endif
break;
}
......@@ -964,7 +1049,6 @@ if (BG==1)
}
#ifdef NR_LDPC_PROFILER_DETAIL
stop_meas(&p_profiler->bnProc);
#endif
......@@ -1052,5 +1136,3 @@ if (BG==1)
# Build script for the bnProc_gen_avx512 generator executable.
# Every *.c file found in this directory is compiled to an object file and
# all objects are linked into $(EXEC).

# Compiler used by the rules below (they reference $(CC)).
CC=gcc
CFLAGS=-W -Wall -mavx2
LDFLAGS=
EXEC=bnProc_gen_avx512
SRC= $(wildcard *.c)
OBJ= $(SRC:.c=.o)

# Default target: build the generator.
all: $(EXEC)

# Link step.
bnProc_gen_avx512: $(OBJ)
	@$(CC) -o $@ $^ $(LDFLAGS) -O2

# Compile step; headers are looked up relative to ${OPENAIR_HOME}/openair1.
%.o: %.c
	@$(CC) -o $@ -c $< $(CFLAGS) -I ${OPENAIR_HOME}/openair1 -g -std=c99

# These targets do not produce a file carrying their own name.
.PHONY: all clean mrproper zip

# Remove the object files only.
clean:
	@rm -rf *.o

# Remove the object files and the executable.
mrproper: clean
	@rm -rf $(EXEC)

# Archive the generator sources together with this Makefile.
zip:
	@tar -zcvf sauvegarde.tar.gz main.c bnProc_gen_BG1_avx512.c bnProc_gen_BG2_avx512.c Makefile
#include <stdio.h>
#include <stdint.h>
#define NB_R 3
void nrLDPC_bnProc_BG1_generator_AVX512(int);
void nrLDPC_bnProc_BG2_generator_AVX512(int);
//void nrLDPC_bnProcPc_BG1_generator_AVX2(int);
//void nrLDPC_bnProcPc_BG2_generator_AVX2(int);
/*
 * Entry point of the generator program: calls the BG1 and then the BG2
 * AVX512 bnProc generator once for each entry of the rate-index table,
 * in table order. Always returns 0.
 */
int main()
{
  static const int rateIdx[NB_R] = {0, 1, 2};
  const int *const end = rateIdx + NB_R;

  for (const int *r = rateIdx; r != end; ++r) {
    nrLDPC_bnProc_BG1_generator_AVX512(*r);
    nrLDPC_bnProc_BG2_generator_AVX512(*r);
  }

  return 0;
}
/**
 * Unrolled AVX2 bit-node pre-computation kernel, hard-wired (per its name)
 * for BG2 / R89.
 *
 * For each bit-node group the int8 entries are sign-extended to int16, the
 * entries of all check nodes (CNs) of the group and the matching llrProcBuf
 * entry are added with saturating 16-bit adds, and the sums are packed back
 * to int8 with signed saturation and stored to llrRes.
 *
 * Layout facts visible in the code:
 *  - the bracketed start offsets are byte offsets into the int8 buffers;
 *  - indices applied to p_bnProcBuf / p_llrProcBuf count 16-byte (__m128i)
 *    units, so e.g. "72 + j" addresses the block 72*16 bytes further on;
 *  - every loop iteration consumes 32 int8 values per operand (two 16-byte
 *    loads) and produces one 32-byte store.
 *
 * @param bnProcBuf    input messages, read with 16-byte loads
 * @param bnProcBufRes only referenced by the 1-CN group, whose trip count is 0
 * @param llrRes       output: saturated int8 sums, written with 32-byte stores
 * @param llrProcBuf   input LLR values added to every sum
 * @param Z            scales the trip count of every group
 *                     (presumably the lifting size - confirm with the caller)
 *
 * NOTE(review): the vector pointer casts assume suitably aligned buffers;
 * confirm against the code that allocates them.
 */
static inline void nrLDPC_bnProcPc_BG2_R89_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {
  __m256i ymm0, ymm1, ymmRes0, ymmRes1;
  __m128i* p_bnProcBuf;
  __m256i* p_bnProcBufRes;
  __m128i* p_llrProcBuf;
  __m256i* p_llrProcBuf256;
  __m256i* p_llrRes;
  uint32_t M;

  // Process group with 1 CNs
  // (0*Z + 31)>>5 is always 0, so this loop body never executes.
  M = (0*Z + 31)>>5;
  p_bnProcBuf     = (__m128i*) &bnProcBuf    [0];
  p_bnProcBufRes  = (__m256i*) &bnProcBufRes [0];
  p_llrProcBuf    = (__m128i*) &llrProcBuf   [0];
  p_llrProcBuf256 = (__m256i*) &llrProcBuf   [0];
  p_llrRes        = (__m256i*) &llrRes       [0];
  for (uint32_t i=0,j=0;i<M;i++,j+=2) {
    p_bnProcBufRes[i] = p_llrProcBuf256[i];
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
    ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
    ymmRes0 = _mm256_adds_epi16(ymm0, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j+1]);
    ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j+1]);
    ymmRes1 = _mm256_adds_epi16(ymm0, ymm1);
    ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
    // packs works per 128-bit lane; 0xD8 swaps the middle 64-bit quarters
    // so the 32 bytes end up in their original order.
    *p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
    p_llrRes++;
  }

  // Process group with 2 CNs
  M = (3*Z + 31)>>5;
  p_bnProcBuf  = (__m128i*) &bnProcBuf  [1152];
  p_llrProcBuf = (__m128i*) &llrProcBuf [1152];
  p_llrRes     = (__m256i*) &llrRes     [1152];
  for (uint32_t i=0,j=0;i<M;i++,j+=2) {
    // Seed both accumulators from the first CN, exactly like the groups
    // below. Without this the accumulators would be read before being
    // written and would carry sums over from one iteration to the next.
    ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
    ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);
    // Second CN
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    // Input LLR
    ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    // Saturate to int8 and restore byte order
    ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
    *p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
    p_llrRes++;
  }

  // Process group with 3 CNs
  M = (5*Z + 31)>>5;
  p_bnProcBuf  = (__m128i*) &bnProcBuf  [3456];
  p_llrProcBuf = (__m128i*) &llrProcBuf [2304];
  p_llrRes     = (__m256i*) &llrRes     [2304];
  for (uint32_t i=0,j=0;i<M;i++,j+=2) {
    ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
    ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[120 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[120 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[240 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[240 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
    *p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
    p_llrRes++;
  }

  // Process group with 4 CNs
  M = (3*Z + 31)>>5;
  p_bnProcBuf  = (__m128i*) &bnProcBuf  [9216];
  p_llrProcBuf = (__m128i*) &llrProcBuf [4224];
  p_llrRes     = (__m256i*) &llrRes     [4224];
  for (uint32_t i=0,j=0;i<M;i++,j+=2) {
    ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
    ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[144 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[144 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[216 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[216 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
    *p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
    p_llrRes++;
  }

  // Process group with 5 CNs
  M = (2*Z + 31)>>5;
  p_bnProcBuf  = (__m128i*) &bnProcBuf  [13824];
  p_llrProcBuf = (__m128i*) &llrProcBuf [5376];
  p_llrRes     = (__m256i*) &llrRes     [5376];
  for (uint32_t i=0,j=0;i<M;i++,j+=2) {
    ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
    ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[48 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[48 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[96 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[96 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[144 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[144 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[192 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[192 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
    *p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
    p_llrRes++;
  }

  // Process group with 6 CNs
  M = (1*Z + 31)>>5;
  p_bnProcBuf  = (__m128i*) &bnProcBuf  [17664];
  p_llrProcBuf = (__m128i*) &llrProcBuf [6144];
  p_llrRes     = (__m256i*) &llrRes     [6144];
  for (uint32_t i=0,j=0;i<M;i++,j+=2) {
    ymmRes0 = _mm256_cvtepi8_epi16(p_bnProcBuf [j]);
    ymmRes1 = _mm256_cvtepi8_epi16(p_bnProcBuf [j +1]);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[24 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[48 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[48 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[72 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[96 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[96 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_bnProcBuf[120 + j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_bnProcBuf[120 + j +1]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_cvtepi8_epi16(p_llrProcBuf[j]);
    ymmRes0 = _mm256_adds_epi16(ymmRes0, ymm0);
    ymm1 = _mm256_cvtepi8_epi16(p_llrProcBuf[j +1 ]);
    ymmRes1 = _mm256_adds_epi16(ymmRes1, ymm1);
    ymm0 = _mm256_packs_epi16(ymmRes0, ymmRes1);
    *p_llrRes = _mm256_permute4x64_epi64(ymm0, 0xD8);
    p_llrRes++;
  }

  // Groups with 7 to 30 CNs: this kernel contains no processing for them.
}
/*
 * Unrolled AVX512 bit-node kernel, hard-wired (per its name) for BG1 / R89.
 *
 * For every bit-node group with 2 to 5 check nodes (CNs) and for every CN of
 * that group, writes
 *     bnProcBufRes = llrRes - bnProcBuf      (int8, saturating)
 * 64 bytes at a time. The bracketed start offsets are byte offsets into the
 * int8 buffers; the CN offsets (18, 126, 6, ...) count 64-byte vectors. The
 * same llrRes vectors are reused for every CN of a group.
 *
 * bnProcBuf    : input messages
 * bnProcBufRes : output
 * llrRes       : input values the messages are subtracted from
 * Z            : scales the number of vectors handled per group
 *                (presumably the lifting size - confirm with the caller)
 */
static inline void nrLDPC_bnProc_BG1_R89_AVX512(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
  const __m512i *msgIn;
  __m512i *msgOut;
  const __m512i *llr;
  uint32_t nVec, k;

  /* ---- group with 2 CNs ---- */
  nVec   = (3*Z + 63) >> 6;
  msgIn  = (const __m512i *) (bnProcBuf    + 384);
  msgOut = (__m512i *)       (bnProcBufRes + 384);
  llr    = (const __m512i *) (llrRes       + 384);
  for (k = 0; k < nVec; k++) {
    msgOut[k] = _mm512_subs_epi8(llr[k], msgIn[k]);
  }
  for (k = 0; k < nVec; k++) {
    msgOut[18 + k] = _mm512_subs_epi8(llr[k], msgIn[18 + k]);
  }

  /* ---- group with 3 CNs ---- */
  nVec   = (21*Z + 63) >> 6;
  msgIn  = (const __m512i *) (bnProcBuf    + 2688);
  msgOut = (__m512i *)       (bnProcBufRes + 2688);
  llr    = (const __m512i *) (llrRes       + 1536);
  for (k = 0; k < nVec; k++) {
    msgOut[k] = _mm512_subs_epi8(llr[k], msgIn[k]);
  }
  for (k = 0; k < nVec; k++) {
    msgOut[126 + k] = _mm512_subs_epi8(llr[k], msgIn[126 + k]);
  }
  for (k = 0; k < nVec; k++) {
    msgOut[252 + k] = _mm512_subs_epi8(llr[k], msgIn[252 + k]);
  }

  /* ---- group with 4 CNs ---- */
  nVec   = (1*Z + 63) >> 6;
  msgIn  = (const __m512i *) (bnProcBuf    + 26880);
  msgOut = (__m512i *)       (bnProcBufRes + 26880);
  llr    = (const __m512i *) (llrRes       + 9600);
  for (k = 0; k < nVec; k++) {
    msgOut[k] = _mm512_subs_epi8(llr[k], msgIn[k]);
  }
  for (k = 0; k < nVec; k++) {
    msgOut[6 + k] = _mm512_subs_epi8(llr[k], msgIn[6 + k]);
  }
  for (k = 0; k < nVec; k++) {
    msgOut[12 + k] = _mm512_subs_epi8(llr[k], msgIn[12 + k]);
  }
  for (k = 0; k < nVec; k++) {
    msgOut[18 + k] = _mm512_subs_epi8(llr[k], msgIn[18 + k]);
  }

  /* ---- group with 5 CNs ---- */
  nVec   = (1*Z + 63) >> 6;
  msgIn  = (const __m512i *) (bnProcBuf    + 28416);
  msgOut = (__m512i *)       (bnProcBufRes + 28416);
  llr    = (const __m512i *) (llrRes       + 9984);
  for (k = 0; k < nVec; k++) {
    msgOut[k] = _mm512_subs_epi8(llr[k], msgIn[k]);
  }
  for (k = 0; k < nVec; k++) {
    msgOut[6 + k] = _mm512_subs_epi8(llr[k], msgIn[6 + k]);
  }
  for (k = 0; k < nVec; k++) {
    msgOut[12 + k] = _mm512_subs_epi8(llr[k], msgIn[12 + k]);
  }
  for (k = 0; k < nVec; k++) {
    msgOut[18 + k] = _mm512_subs_epi8(llr[k], msgIn[18 + k]);
  }
  for (k = 0; k < nVec; k++) {
    msgOut[24 + k] = _mm512_subs_epi8(llr[k], msgIn[24 + k]);
  }

  /* Groups with 6 to 30 CNs: this kernel contains no processing for them. */
}
#include <stdint.h>
#include <immintrin.h>
void nrLDPC_bnProc_BG2_R23_AVX2(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
__m256i* p_bnProcBuf;
__m256i* p_bnProcBufRes;
__m256i* p_llrRes;
__m256i* p_res;
void nrLDPC_bnProc_BG2_R23_AVX512(int8_t* bnProcBuf,int8_t* bnProcBufRes, int8_t* llrRes, uint16_t Z ) {
__m512i* p_bnProcBuf;
__m512i* p_bnProcBufRes;
__m512i* p_llrRes;
__m512i* p_res;
uint32_t M, i;
// Process group with 2 CNs
M = (3*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [1152];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [1152];
M = (3*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [1152];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [1152];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [1152];
p_llrRes = (__m512i*) &llrRes [1152];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [1152];
p_res = &p_bnProcBufRes[18];
p_llrRes = (__m512i*) &llrRes [1152];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[18 + i]);
}
// Process group with 3 CNs
M = (5*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [3456];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [3456];
M = (5*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [3456];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [3456];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [2304];
p_llrRes = (__m512i*) &llrRes [2304];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[60];
p_llrRes = (__m256i*) &llrRes [2304];
p_res = &p_bnProcBufRes[30];
p_llrRes = (__m512i*) &llrRes [2304];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[30 + i]);
}
p_res = &p_bnProcBufRes[120];
p_llrRes = (__m256i*) &llrRes [2304];
p_res = &p_bnProcBufRes[60];
p_llrRes = (__m512i*) &llrRes [2304];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[120 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]);
}
// Process group with 4 CNs
M = (3*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [9216];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [9216];
M = (3*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [9216];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [9216];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [4224];
p_llrRes = (__m512i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [4224];
p_res = &p_bnProcBufRes[18];
p_llrRes = (__m512i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[18 + i]);
}
p_res = &p_bnProcBufRes[72];
p_llrRes = (__m256i*) &llrRes [4224];
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m512i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
}
p_res = &p_bnProcBufRes[108];
p_llrRes = (__m256i*) &llrRes [4224];
p_res = &p_bnProcBufRes[54];
p_llrRes = (__m512i*) &llrRes [4224];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[108 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[54 + i]);
}
// Process group with 5 CNs
M = (2*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [13824];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [13824];
M = (2*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [13824];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [13824];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [5376];
p_llrRes = (__m512i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [5376];
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m512i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
}
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [5376];
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m512i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
}
p_res = &p_bnProcBufRes[72];
p_llrRes = (__m256i*) &llrRes [5376];
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m512i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[72 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
}
p_res = &p_bnProcBufRes[96];
p_llrRes = (__m256i*) &llrRes [5376];
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m512i*) &llrRes [5376];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[96 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]);
}
// Process group with 6 CNs
M = (1*Z + 31)>>5;
p_bnProcBuf = (__m256i*) &bnProcBuf [17664];
p_bnProcBufRes = (__m256i*) &bnProcBufRes [17664];
M = (1*Z + 63)>>6;
p_bnProcBuf = (__m512i*) &bnProcBuf [17664];
p_bnProcBufRes = (__m512i*) &bnProcBufRes [17664];
p_res = &p_bnProcBufRes[0];
p_llrRes = (__m256i*) &llrRes [6144];
p_llrRes = (__m512i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[0 + i]);
}
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m256i*) &llrRes [6144];
p_res = &p_bnProcBufRes[6];
p_llrRes = (__m512i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[6 + i]);
}
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m256i*) &llrRes [6144];
p_res = &p_bnProcBufRes[12];
p_llrRes = (__m512i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[12 + i]);
}
p_res = &p_bnProcBufRes[36];
p_llrRes = (__m256i*) &llrRes [6144];
p_res = &p_bnProcBufRes[18];
p_llrRes = (__m512i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[36 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[18 + i]);
}
p_res = &p_bnProcBufRes[48];
p_llrRes = (__m256i*) &llrRes [6144];
p_res = &p_bnProcBufRes[24];
p_llrRes = (__m512i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[48 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[24 + i]);
}
p_res = &p_bnProcBufRes[60];
p_llrRes = (__m256i*) &llrRes [6144];
p_res = &p_bnProcBufRes[30];
p_llrRes = (__m512i*) &llrRes [6144];
for (i=0;i<M;i++) {
p_res[i] = _mm256_subs_epi8(p_llrRes[i], p_bnProcBuf[60 + i]);
p_res[i] = _mm512_subs_epi8(p_llrRes[i], p_bnProcBuf[30 + i]);
}
// Process group with 7 CNs
// Process group with 8 CNs
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment