Commit a7bc2823 authored by Raymond Knopp, committed by Robert Schmidt

First level of support for new aarch64 machines

Testing on Neoverse N1 (Ampere 3GHz). Tested only on a subset of phy-simulators.

- Changes:
- use SIMDE consistently
- adaptations of LDPC decoder generator for ARMv8 performance
- SIMDe modifications of Intel CRC to allow for aarch64 build.
  optimizations for 128-bit to improve performance of LDPC encode/decode
  on aarch64 (Neoverse N1)
- added BG2 files for 128-bit ldpc encoder (aarch64)
- testing on Xeon
- testing on x86
- minor changes to build/run on x86
- change in crc.h after returning to aarch64
- removed some warnings in ldpc decoder generator for x86_64
- char
- Delete irrelevant constants
- Correctly declare variables
- Define stdbool for all architectures
- Remove definition of _MM_SHUFFLE and use SIMDE_MM_SHUFFLE
- Remove commented code
- Fix CMakeLists.txt
- Include SIMDE avx2 functions in tools_defs.h
parent 79fd37b4
......@@ -102,21 +102,20 @@ add_list_string_option(CMAKE_BUILD_TYPE "RelWithDebInfo" "Choose the type of bui
# If /proc/cpuinfo exists, inspect it to find out which intrinsics are available:
# - so that we do not always go through SIMDE emulation
# - so that gcc does not generate AVX512 instructions
execute_process(COMMAND uname -m OUTPUT_VARIABLE CPUARCH OUTPUT_STRIP_TRAILING_WHITESPACE)
message(STATUS "CPUARCH ${CPUARCH}")
if(EXISTS "/proc/cpuinfo")
file(STRINGS "/proc/cpuinfo" CPUFLAGS REGEX flags LIMIT_COUNT 1)
else()
message(WARNING "did not find /proc/cpuinfo -- not setting any x86-specific compilation variables")
endif()
eval_boolean(AUTODETECT_AVX512 DEFINED CPUFLAGS AND CPUFLAGS MATCHES "avx512")
add_boolean_option(AVX512 ${AUTODETECT_AVX512} "Whether AVX512 intrinsics is available on the host processor" ON)
eval_boolean(AUTODETECT_AVX2 DEFINED CPUFLAGS AND CPUFLAGS MATCHES "avx2")
add_boolean_option(AVX2 ${AUTODETECT_AVX2} "Whether AVX2 intrinsics is available on the host processor" ON)
if(${CPUARCH} STREQUAL "x86_64" AND DEFINED CPUFLAGS)
message(STATUS "CPU architecture is ${CMAKE_SYSTEM_PROCESSOR}")
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
# The following intrinsics are assumed to be available on any x86 system
# (avx, f16c, fma, gnfi, mmx, pclmul, sse, sse2, sse3, xop)
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -DSIMDE_X86_AVX_NATIVE -DSIMDE_X86_AVX_NATIVE -DSIMDE_X86_F16C_NATIVE -DSIMDE_X86_FMA_NATIVE -DSIMDE_X86_GFNI_NATIVE -DSIMDE_X86_MMX_NATIVE -DSIMDE_X86_PCLMUL_NATIVE -DSIMDE_X86_SSE2_NATIVE -DSIMDE_X86_SSE3_NATIVE -DSIMDE_X86_SSE_NATIVE -DSIMDE_X86_XOP_HAVE_COM_ -DSIMDE_X86_XOP_NATIVE")
......@@ -139,8 +138,12 @@ if(${CPUARCH} STREQUAL "x86_64" AND DEFINED CPUFLAGS)
if(CPUINFO MATCHES "ssse3")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -DSIMDE_X86_SSSE3_NATIVE")
endif()
elseif(${CPUARCH} NOT STREQUAL "x86_64")
message(FATAL_ERROR "Cannot compile for CPU architecture ${CPUARCH}")
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -gdwarf-2 -mfloat-abi=hard -mfpu=neon -lgcc -lrt")
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -gdwarf-2 -lgcc -lrt")
else()
message(FATAL_ERROR "compile for CPU architecture ${CPUARCH}, CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}")
endif()
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -march=native")
......@@ -856,7 +859,7 @@ target_link_libraries(ldpc_parityCheck PRIVATE ldpc_gen_HEADERS)
add_library(coding MODULE ${PHY_TURBOSRC} )
add_library(dfts MODULE ${OPENAIR1_DIR}/PHY/TOOLS/oai_dfts.c )
add_library(dfts MODULE ${OPENAIR1_DIR}/PHY/TOOLS/oai_dfts.c ${OPENAIR1_DIR}/PHY/TOOLS/oai_dfts_neon.c)
set(PHY_SRC_COMMON
......
......@@ -52,7 +52,11 @@
// Fixme: find a better place to be sure it is called
void read_cpu_hardware (void) __attribute__ ((constructor));
void read_cpu_hardware (void) {__builtin_cpu_init(); }
#if !defined(__arm__) && !defined(__aarch64__)
void read_cpu_hardware (void) {__builtin_cpu_init(); }
#else
void read_cpu_hardware (void) {}
#endif
log_mem_cnt_t log_mem_d[2];
int log_mem_flag = 0;
......
......@@ -38,7 +38,7 @@ static time_stats_t **measur_table;
notifiedFIFO_t measur_fifo;
double get_cpu_freq_GHz(void)
{
if (cpu_freq_GHz <1 ) {
if (cpu_freq_GHz <0.01 ) {
time_stats_t ts = {0};
reset_meas(&ts);
ts.trials++;
......@@ -46,8 +46,7 @@ double get_cpu_freq_GHz(void)
sleep(1);
ts.diff = (rdtsc_oai()-ts.in);
cpu_freq_GHz = (double)ts.diff/1000000000;
printf("CPU Freq is %f \n", cpu_freq_GHz);
}
}
return cpu_freq_GHz;
}
......
......@@ -106,8 +106,16 @@ static inline unsigned long long rdtsc_oai(void) {
__asm__ volatile ("rdtsc" : "=a" (a), "=d" (d));
return (d<<32) | a;
}
#elif defined(__aarch64__)
static inline uint64_t rdtsc_oai(void) __attribute__((always_inline));
static inline uint64_t rdtsc_oai(void)
{
uint64_t r = 0;
asm volatile("mrs %0, cntvct_el0" : "=r"(r));
return r;
}
#elif defined(__arm__) || defined(__aarch64__)
#elif defined(__arm__)
static inline uint32_t rdtsc_oai(void) __attribute__((always_inline));
static inline uint32_t rdtsc_oai(void) {
uint32_t r = 0;
......
......@@ -118,12 +118,6 @@ char *itoa(int i) {
return strdup(buffer);
}
void *memcpy1(void *dst,const void *src,size_t n) {
void *ret=dst;
asm volatile("rep movsb" : "+D" (dst) : "c"(n), "S"(src) : "cc","memory");
return(ret);
}
void set_priority(int priority)
{
......
......@@ -102,8 +102,6 @@ int hex_char_to_hex_value (char c);
// Converts an hexadecimal ASCII coded string into its value.**
int hex_string_to_hex_value (uint8_t *hex_value, const char *hex_string, int size);
void *memcpy1(void *dst,const void *src,size_t n);
void set_priority(int priority);
char *itoa(int i);
......
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -34,9 +34,9 @@ void exit_function(const char* file, const char* function, const int line, const
exit(-1);
}
signed char quantize(double D, double x, unsigned char B) {
int8_t quantize(double D, double x, uint8_t B) {
double qxd;
short maxlev;
int16_t maxlev;
qxd = floor(x / D);
maxlev = 1 << (B - 1); //(char)(pow(2,B-1));
......@@ -45,7 +45,7 @@ signed char quantize(double D, double x, unsigned char B) {
else if (qxd >= maxlev)
qxd = maxlev - 1;
return ((char) qxd);
return ((int8_t) qxd);
}
......
......@@ -23,6 +23,7 @@
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "assertions.h"
#include "SIMULATION/TOOLS/sim.h"
#include "common/utils/load_module_shlib.h"
......@@ -43,7 +44,7 @@
#define NR_LDPC_ENABLE_PARITY_CHECK
// 4-bit quantizer
char quantize4bit(double D,double x)
int8_t quantize4bit(double D,double x)
{
double qxd;
qxd = floor(x/D);
......@@ -54,13 +55,13 @@ char quantize4bit(double D,double x)
else if (qxd > 7)
qxd = 7;
return((char)qxd);
return((int8_t)qxd);
}
char quantize8bit(double D,double x)
int8_t quantize8bit(double D,double x)
{
double qxd;
//char maxlev;
//int8_t maxlev;
qxd = floor(x/D);
//maxlev = 1<<(B-1);
......@@ -72,7 +73,7 @@ char quantize8bit(double D,double x)
else if (qxd >= 128)
qxd = 127;
return((char)qxd);
return((int8_t)qxd);
}
typedef struct {
......@@ -92,7 +93,7 @@ int test_ldpc(short max_iterations,
int nom_rate,
int denom_rate,
double SNR,
unsigned char qbits,
uint8_t qbits,
short block_length,
unsigned int ntrials,
int n_segments,
......@@ -117,15 +118,15 @@ int test_ldpc(short max_iterations,
sigma = 1.0/sqrt(2*SNR);
opp_enabled=1;
//short test_input[block_length];
unsigned char *test_input[MAX_NUM_NR_DLSCH_SEGMENTS_PER_LAYER*NR_MAX_NB_LAYERS]={NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL};;
uint8_t *test_input[MAX_NUM_NR_DLSCH_SEGMENTS_PER_LAYER*NR_MAX_NB_LAYERS]={NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL};;
//short *c; //padded codeword
unsigned char estimated_output[MAX_NUM_DLSCH_SEGMENTS][block_length];
uint8_t estimated_output[MAX_NUM_DLSCH_SEGMENTS][block_length];
memset(estimated_output, 0, sizeof(estimated_output));
unsigned char *channel_input[MAX_NUM_DLSCH_SEGMENTS];
unsigned char *channel_input_optim[MAX_NUM_DLSCH_SEGMENTS];
uint8_t *channel_input[MAX_NUM_DLSCH_SEGMENTS];
uint8_t *channel_input_optim[MAX_NUM_DLSCH_SEGMENTS];
//double channel_output[68 * 384];
double modulated_input[MAX_NUM_DLSCH_SEGMENTS][68 * 384] = { 0 };
char channel_output_fixed[MAX_NUM_DLSCH_SEGMENTS][68 * 384] = { 0 };
int8_t channel_output_fixed[MAX_NUM_DLSCH_SEGMENTS][68 * 384] = { 0 };
short BG=0,nrows=0;//,ncols;
int no_punctured_columns,removed_bit;
int i1,Zc,Kb=0;
......@@ -149,12 +150,12 @@ int test_ldpc(short max_iterations,
// generate input block
for(int j=0;j<MAX_NUM_DLSCH_SEGMENTS;j++) {
test_input[j]=(unsigned char *)malloc16(sizeof(unsigned char) * block_length/8);
memset(test_input[j], 0, sizeof(unsigned char) * block_length / 8);
channel_input[j] = (unsigned char *)malloc16(sizeof(unsigned char) * 68*384);
memset(channel_input[j], 0, sizeof(unsigned char) * 68 * 384);
channel_input_optim[j] = (unsigned char *)malloc16(sizeof(unsigned char) * 68*384);
memset(channel_input_optim[j], 0, sizeof(unsigned char) * 68 * 384);
test_input[j]=(uint8_t *)malloc16(sizeof(uint8_t) * block_length/8);
memset(test_input[j], 0, sizeof(uint8_t) * block_length / 8);
channel_input[j] = (uint8_t *)malloc16(sizeof(uint8_t) * 68*384);
memset(channel_input[j], 0, sizeof(uint8_t) * 68 * 384);
channel_input_optim[j] = (uint8_t *)malloc16(sizeof(uint8_t) * 68*384);
memset(channel_input_optim[j], 0, sizeof(uint8_t) * 68 * 384);
}
reset_meas(&time);
......@@ -179,7 +180,7 @@ int test_ldpc(short max_iterations,
for (int j=0;j<MAX_NUM_DLSCH_SEGMENTS;j++) {
for (int i=0; i<block_length/8; i++) {
test_input[j][i]=(unsigned char) rand();
test_input[j][i]=(uint8_t) rand();
//test_input[j][i]=j%256;
//test_input[j][i]=252;
}
......@@ -265,7 +266,7 @@ int test_ldpc(short max_iterations,
removed_bit=(nrows-no_punctured_columns-2) * Zc+block_length-(int)(block_length/((float)nom_rate/(float)denom_rate));
encoder_implemparams_t impp=INIT0_LDPCIMPLEMPARAMS;
impp.gen_code=1;
impp.gen_code = 2;
if (ntrials==0)
encoder_orig(test_input,channel_input, Zc, BG, block_length, BG, &impp);
impp.gen_code=0;
......@@ -327,21 +328,21 @@ int test_ldpc(short max_iterations,
modulated_input[j][i]=-1.0;///sqrt(2);
///channel_output[i] = modulated_input[i] + gaussdouble(0.0,1.0) * 1/sqrt(2*SNR);
//channel_output_fixed[i] = (char) ((channel_output[i]*128)<0?(channel_output[i]*128-0.5):(channel_output[i]*128+0.5)); //fixed point 9-7
//channel_output_fixed[i] = (int8_t) ((channel_output[i]*128)<0?(channel_output[i]*128-0.5):(channel_output[i]*128+0.5)); //fixed point 9-7
//printf("llr[%d]=%d\n",i,channel_output_fixed[i]);
//channel_output_fixed[i] = (char)quantize(sigma/4.0,(2.0*modulated_input[i]) - 1.0 + sigma*gaussdouble(0.0,1.0),qbits);
channel_output_fixed[j][i] = (char)quantize(sigma/4.0/4.0,modulated_input[j][i] + sigma*gaussdouble(0.0,1.0),qbits);
//channel_output_fixed[i] = (char)quantize8bit(sigma/4.0,(2.0*modulated_input[i]) - 1.0 + sigma*gaussdouble(0.0,1.0));
//channel_output_fixed[i] = (int8_t)quantize(sigma/4.0,(2.0*modulated_input[i]) - 1.0 + sigma*gaussdouble(0.0,1.0),qbits);
channel_output_fixed[j][i] = (int8_t)quantize(sigma/4.0/4.0,modulated_input[j][i] + sigma*gaussdouble(0.0,1.0),qbits);
//channel_output_fixed[i] = (int8_t)quantize8bit(sigma/4.0,(2.0*modulated_input[i]) - 1.0 + sigma*gaussdouble(0.0,1.0));
//printf("llr[%d]=%d\n",i,channel_output_fixed[i]);
//printf("channel_output_fixed[%d]: %d\n",i,channel_output_fixed[i]);
//Uncoded BER
unsigned char channel_output_uncoded = channel_output_fixed[j][i]<0 ? 1 /* QPSK demod */ : 0;
uint8_t channel_output_uncoded = channel_output_fixed[j][i]<0 ? 1 /* QPSK demod */ : 0;
if (channel_output_uncoded != channel_input_optim[j][i-2*Zc])
*errors_bit_uncoded = (*errors_bit_uncoded) + 1;
*errors_bit_uncoded = (*errors_bit_uncoded) + 1;
}
......@@ -373,8 +374,8 @@ int test_ldpc(short max_iterations,
}
for (int i=0; i<block_length; i++)
{
unsigned char estoutputbit = (estimated_output[j][i/8]&(1<<(i&7)))>>(i&7);
unsigned char inputbit = (test_input[j][i/8]&(1<<(i&7)))>>(i&7); // Further correct for multiple segments
uint8_t estoutputbit = (estimated_output[j][i/8]&(1<<(i&7)))>>(i&7);
uint8_t inputbit = (test_input[j][i/8]&(1<<(i&7)))>>(i&7); // Further correct for multiple segments
if (estoutputbit != inputbit)
*errors_bit = (*errors_bit) + 1;
}
......@@ -445,7 +446,7 @@ int main(int argc, char *argv[])
int nom_rate=1;
int denom_rate=3;
double SNR0=-2.0,SNR,SNR_lin;
unsigned char qbits=8;
uint8_t qbits=8;
unsigned int decoded_errors[10000]; // initiate the size of matrix equivalent to size of SNR
int c,i=0, i1 = 0;
......
......@@ -116,7 +116,7 @@ void lte_param_init(unsigned char N_tx, unsigned char N_rx,unsigned char transmi
}
/*
void print_shorts(char *s,__m128i *x) {
void print_shorts(char *s,simde__m128i *x) {
short *tempb = (short *)x;
......
......@@ -39,9 +39,6 @@
#ifndef __CRC_H__
#define __CRC_H__
#include <x86intrin.h>
#include "crcext.h"
#include "types.h"
#include "PHY/sse_intrin.h"
......@@ -305,14 +302,20 @@ uint32_t crc32_calc_slice4(const uint8_t *data,
* @return New 16 byte folded data
*/
__forceinline
__m128i crc32_folding_round(const __m128i data_block,
const __m128i k1_k2,
const __m128i fold)
simde__m128i crc32_folding_round(const simde__m128i data_block,
const simde__m128i k1_k2,
const simde__m128i fold)
{
__m128i tmp = _mm_clmulepi64_si128(fold, k1_k2, 0x11);
#ifdef __x86_64__
simde__m128i tmp = _mm_clmulepi64_si128(fold, k1_k2, 0x11);
return simde_mm_xor_si128(_mm_clmulepi64_si128(fold, k1_k2, 0x00), simde_mm_xor_si128(data_block, tmp));
#else
simde__m128i tmp = simde_mm_clmulepi64_si128(fold, k1_k2, 0x11);
return _mm_xor_si128(_mm_clmulepi64_si128(fold, k1_k2, 0x00),
_mm_xor_si128(data_block, tmp));
return simde_mm_xor_si128(simde_mm_clmulepi64_si128(fold, k1_k2, 0x00),
simde_mm_xor_si128(data_block, tmp));
#endif
}
/**
......@@ -324,17 +327,23 @@ __m128i crc32_folding_round(const __m128i data_block,
* @return data reduced to 64 bits
*/
__forceinline
__m128i crc32_reduce_128_to_64(__m128i data128, const __m128i k3_q)
simde__m128i crc32_reduce_128_to_64(simde__m128i data128, const simde__m128i k3_q)
{
__m128i tmp;
simde__m128i tmp;
tmp = _mm_xor_si128(_mm_clmulepi64_si128(data128, k3_q, 0x01 /* k3 */),
#ifdef __x86_64__
tmp = simde_mm_xor_si128(_mm_clmulepi64_si128(data128, k3_q, 0x01 /* k3 */),
data128);
data128 = _mm_xor_si128(_mm_clmulepi64_si128(tmp, k3_q, 0x01 /* k3 */),
data128 = simde_mm_xor_si128(_mm_clmulepi64_si128(tmp, k3_q, 0x01 /* k3 */),
data128);
#else
tmp = simde_mm_xor_si128(simde_mm_clmulepi64_si128(data128, k3_q, 0x01 /* k3 */), data128);
data128 = simde_mm_xor_si128(simde_mm_clmulepi64_si128(tmp, k3_q, 0x01 /* k3 */), data128);
return _mm_srli_si128(_mm_slli_si128(data128, 8), 8);
#endif
return simde_mm_srli_si128(simde_mm_slli_si128(data128, 8), 8);
}
/**
......@@ -348,15 +357,22 @@ __m128i crc32_reduce_128_to_64(__m128i data128, const __m128i k3_q)
*/
__forceinline
uint32_t
crc32_reduce_64_to_32(__m128i fold, const __m128i k3_q, const __m128i p_res)
crc32_reduce_64_to_32(simde__m128i fold, const simde__m128i k3_q, const simde__m128i p_res)
{
__m128i temp;
temp = _mm_clmulepi64_si128(_mm_srli_si128(fold, 4),
simde__m128i temp;
#ifdef __x86_64__
temp = _mm_clmulepi64_si128(simde_mm_srli_si128(fold, 4),
k3_q, 0x10 /* Q */);
temp = _mm_srli_si128(_mm_xor_si128(temp, fold), 4);
temp = simde_mm_srli_si128(simde_mm_xor_si128(temp, fold), 4);
temp = _mm_clmulepi64_si128(temp, p_res, 0 /* P */);
return _mm_extract_epi32(_mm_xor_si128(temp, fold), 0);
#else
temp = simde_mm_clmulepi64_si128(simde_mm_srli_si128(fold, 4),
k3_q, 0x10 /* Q */);
temp = simde_mm_srli_si128(simde_mm_xor_si128(temp, fold), 4);
temp = simde_mm_clmulepi64_si128(temp, p_res, 0 /* P */);
#endif
return simde_mm_extract_epi32(simde_mm_xor_si128(temp, fold), 0);
}
/**
......@@ -379,7 +395,7 @@ crc32_calc_pclmulqdq(const uint8_t *data,
uint32_t data_len, uint32_t crc,
const struct crc_pclmulqdq_ctx *params)
{
__m128i temp, fold, k, swap;
simde__m128i temp, fold, k, swap;
uint32_t n;
if (unlikely(data == NULL || data_len == 0 || params == NULL))
......@@ -405,7 +421,7 @@ crc32_calc_pclmulqdq(const uint8_t *data,
* Load first 16 data bytes in \a fold and
* set \a swap BE<->LE 16 byte conversion variable
*/
fold = _mm_loadu_si128((__m128i *)data);
fold = simde_mm_loadu_si128((simde__m128i *)data);
swap = crc_xmm_be_le_swap128;
/**
......@@ -420,20 +436,20 @@ crc32_calc_pclmulqdq(const uint8_t *data,
* - adjust data block
* - 4 least significant bytes need to be zero
*/
fold = _mm_shuffle_epi8(fold, swap);
fold = _mm_slli_si128(xmm_shift_right(fold, 20 - data_len), 4);
fold = simde_mm_shuffle_epi8(fold, swap);
fold = simde_mm_slli_si128(xmm_shift_right(fold, 20 - data_len), 4);
/**
* Apply CRC init value
*/
temp = _mm_insert_epi32(_mm_setzero_si128(), bswap4(crc), 0);
temp = simde_mm_insert_epi32(simde_mm_setzero_si128(), bswap4(crc), 0);
temp = xmm_shift_left(temp, data_len - 4);
fold = _mm_xor_si128(fold, temp);
fold = simde_mm_xor_si128(fold, temp);
} else {
/**
* There are 2x16 data blocks or more
*/
__m128i next_data;
simde__m128i next_data;
/**
* n = number of bytes required to align \a data_len
......@@ -445,10 +461,10 @@ crc32_calc_pclmulqdq(const uint8_t *data,
* Apply CRC initial value and
* get \a fold to BE format
*/
fold = _mm_xor_si128(fold,
_mm_insert_epi32(_mm_setzero_si128(),
fold = simde_mm_xor_si128(fold,
simde_mm_insert_epi32(simde_mm_setzero_si128(),
crc, 0));
fold = _mm_shuffle_epi8(fold, swap);
fold = simde_mm_shuffle_epi8(fold, swap);
/**
* Load next 16 bytes of data and
......@@ -456,9 +472,9 @@ crc32_calc_pclmulqdq(const uint8_t *data,
*
* CONCAT(fold,next_data) >> (n*8)
*/
next_data = _mm_loadu_si128((__m128i *)&data[16]);
next_data = _mm_shuffle_epi8(next_data, swap);
next_data = _mm_or_si128(xmm_shift_right(next_data, n),
next_data = simde_mm_loadu_si128((simde__m128i *)&data[16]);
next_data = simde_mm_shuffle_epi8(next_data, swap);
next_data = simde_mm_or_si128(xmm_shift_right(next_data, n),
xmm_shift_left(fold, 16 - n));
fold = xmm_shift_right(fold, n);
......@@ -467,12 +483,12 @@ crc32_calc_pclmulqdq(const uint8_t *data,
* In such unlikely case clear 4 least significant bytes
*/
next_data =
_mm_slli_si128(_mm_srli_si128(next_data, 4), 4);
simde_mm_slli_si128(simde_mm_srli_si128(next_data, 4), 4);
/**
* Do the initial folding round on 2 first 16 byte chunks
*/
k = _mm_load_si128((__m128i *)(&params->k1));
k = simde_mm_load_si128((simde__m128i *)(&params->k1));
fold = crc32_folding_round(next_data, k, fold);
if (likely(data_len > 32)) {
......@@ -480,7 +496,7 @@ crc32_calc_pclmulqdq(const uint8_t *data,
* \a data_block needs to be at least 48 bytes long
* in order to get here
*/
__m128i new_data;
simde__m128i new_data;
/**
* Main folding loop
......@@ -493,8 +509,8 @@ crc32_calc_pclmulqdq(const uint8_t *data,
* - the last 16 bytes is processed separately
*/
for (n = 16 + 16 - n; n < (data_len - 16); n += 16) {
new_data = _mm_loadu_si128((__m128i *)&data[n]);
new_data = _mm_shuffle_epi8(new_data, swap);
new_data = simde_mm_loadu_si128((simde__m128i *)&data[n]);
new_data = simde_mm_shuffle_epi8(new_data, swap);
fold = crc32_folding_round(new_data, k, fold);
}
......@@ -504,9 +520,9 @@ crc32_calc_pclmulqdq(const uint8_t *data,
* Read from offset -4 is to avoid one
* shift right operation.
*/
new_data = _mm_loadu_si128((__m128i *)&data[n - 4]);
new_data = _mm_shuffle_epi8(new_data, swap);
new_data = _mm_slli_si128(new_data, 4);
new_data = simde_mm_loadu_si128((simde__m128i *)&data[n - 4]);
new_data = simde_mm_shuffle_epi8(new_data, swap);
new_data = simde_mm_slli_si128(new_data, 4);
fold = crc32_folding_round(new_data, k, fold);
} /* if (data_len > 32) */
}
......@@ -520,14 +536,14 @@ crc32_calc_pclmulqdq(const uint8_t *data,
/**
* REDUCTION 128 -> 64
*/
k = _mm_load_si128((__m128i *)(&params->k3));
k = simde_mm_load_si128((simde__m128i *)(&params->k3));
fold = crc32_reduce_128_to_64(fold, k);
/**
* REDUCTION 64 -> 32
*/
n = crc32_reduce_64_to_32(fold, k,
_mm_load_si128((__m128i *)(&params->p)));
simde_mm_load_si128((simde__m128i *)(&params->p)));
#ifdef __KERNEL__
/**
......
......@@ -30,17 +30,15 @@
Modified in June, 2001, to include the length non multiple of 8
*/
#ifndef __SSE4_1__
#if !defined(__SSE4_1__) && !defined(__aarch64__)
#define USE_INTEL_CRC 0
#else
#define USE_INTEL_CRC __SSE4_1__
#define USE_INTEL_CRC 1
#include "crc.h"
#endif
#include "coding_defs.h"
#include "assertions.h"
#if USE_INTEL_CRC
#include "crc.h"
#endif
/*ref 36-212 v8.6.0 , pp 8-9 */
/* the highest degree is set by default */
......@@ -103,14 +101,14 @@ static uint32_t crc6Table[256];
#if USE_INTEL_CRC
static const struct crc_pclmulqdq_ctx lte_crc24a_pclmulqdq __attribute__((aligned(16))) = {
0x64e4d700, /**< k1 */
0x2c8c9d00, /**< k2 */
0xd9fe8c00, /**< k3 */
0xf845fe24, /**< q */
0x864cfb00, /**< p */
0ULL /**< res */
0x64e4d700, /**< k1 */
0x2c8c9d00, /**< k2 */
0xd9fe8c00, /**< k3 */
0xf845fe24, /**< q */
0x864cfb00, /**< p */
0ULL /**< res */
};
__m128i crc_xmm_be_le_swap128;
simde__m128i crc_xmm_be_le_swap128;
const uint8_t crc_xmm_shift_tab[48]
__attribute__((aligned(16))) = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
......@@ -133,9 +131,9 @@ void crcTableInit (void)
crc8Table[c] = crcbit(&c, 1, poly8) >> 24;
crc6Table[c] = crcbit(&c, 1, poly6) >> 24;
} while (++c);
#if USE_INTEL_CRC
crc_xmm_be_le_swap128 = _mm_setr_epi32(0x0c0d0e0f, 0x08090a0b,
0x04050607, 0x00010203);
#if defined(__SSE4_1__) || defined(__aarch64__)
crc_xmm_be_le_swap128 = simde_mm_setr_epi32(0x0c0d0e0f, 0x08090a0b,
0x04050607, 0x00010203);
#endif
}
......@@ -164,7 +162,7 @@ uint32_t crc24a(unsigned char* inptr, int bitlen)
crc = (crc << resbit) ^ crc24aTable[((*inptr) >> (8 - resbit)) ^ (crc >> (32 - resbit))];
return crc;
}
#if USE_INTEL_CRC
#if defined(__SSE4_1__) || defined(__aarch64__)
else {
return crc32_calc_pclmulqdq(inptr, octetlen, 0,
&lte_crc24a_pclmulqdq);
......
......@@ -33,7 +33,7 @@
#ifndef __CRCEXT_H__
#define __CRCEXT_H__
#include <x86intrin.h>
#include "PHY/sse_intrin.h"
#include "types.h"
/**
* Flag indicating availability of PCLMULQDQ instruction
......@@ -45,7 +45,7 @@ extern int pclmulqdq_available;
* Flag indicating availability of PCLMULQDQ instruction
* Only valid after running CRCInit() function.
*/
extern __m128i crc_xmm_be_le_swap128;
extern simde__m128i crc_xmm_be_le_swap128;
extern const uint8_t crc_xmm_shift_tab[48];
/**
......@@ -57,11 +57,11 @@ extern const uint8_t crc_xmm_shift_tab[48];
* @return \a reg >> (\a num * 8)
*/
__forceinline
__m128i xmm_shift_right(__m128i reg, const unsigned int num)
simde__m128i xmm_shift_right(simde__m128i reg, const unsigned int num)
{
const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 + num);
const simde__m128i *p = (const simde__m128i *)(crc_xmm_shift_tab + 16 + num);
return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
return simde_mm_shuffle_epi8(reg, simde_mm_loadu_si128(p));
}
/**
......@@ -73,11 +73,11 @@ __m128i xmm_shift_right(__m128i reg, const unsigned int num)
* @return \a reg << (\a num * 8)
*/
__forceinline
__m128i xmm_shift_left(__m128i reg, const unsigned int num)
simde__m128i xmm_shift_left(simde__m128i reg, const unsigned int num)
{
const __m128i *p = (const __m128i *)(crc_xmm_shift_tab + 16 - num);
const simde__m128i *p = (const simde__m128i *)(crc_xmm_shift_tab + 16 - num);
return _mm_shuffle_epi8(reg, _mm_loadu_si128(p));
return simde_mm_shuffle_epi8(reg, simde_mm_loadu_si128(p));
}
/**
......
......@@ -7,13 +7,17 @@ add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/generator_cnProc_avx512 ldpc/generato
add_custom_target(ldpc_generators)
add_dependencies(ldpc_generators
bnProc_gen_avx2
bnProc_gen_128
bnProc_gen_avx512
cnProc_gen_avx2
cnProc_gen_128
cnProc_gen_avx512)
add_library(ldpc_gen_HEADERS INTERFACE)
target_link_libraries(ldpc_gen_HEADERS INTERFACE
bnProc_gen_avx2_HEADERS
bnProc_gen_128_HEADERS
bnProc_gen_avx512_HEADERS
cnProc_gen_avx2_HEADERS
cnProc_gen_128_HEADERS
cnProc_gen_avx512_HEADERS)
......@@ -4,8 +4,16 @@ add_executable(bnProc_gen_avx2
bnProcPc_gen_BG1_avx2.c
bnProcPc_gen_BG2_avx2.c
main.c)
target_compile_options(bnProc_gen_avx2 PRIVATE -W -Wall -mavx2)
add_executable(bnProc_gen_128
bnProc_gen_BG1_128.c
bnProc_gen_BG2_128.c
bnProcPc_gen_BG1_128.c
bnProcPc_gen_BG2_128.c
main128.c)
target_compile_options(bnProc_gen_avx2 PRIVATE -W -Wall )
target_compile_options(bnProc_gen_128 PRIVATE -W -Wall )
#set(bnProc_headers
# bnProc/nrLDPC_bnProc_BG1_R13_AVX2.h
# bnProc/nrLDPC_bnProc_BG1_R23_AVX2.h
......@@ -30,7 +38,18 @@ add_custom_command(TARGET bnProc_gen_avx2 POST_BUILD
DEPENDS bnProc_gen_avx2
COMMENT "Generating LDPC bnProc header files for AVX2"
)
add_custom_command(TARGET bnProc_gen_128 POST_BUILD
#OUTPUT ${bnProc_headers} ${bnProcPc_headers}
COMMAND ${CMAKE_COMMAND} -E make_directory bnProc128
COMMAND ${CMAKE_COMMAND} -E make_directory bnProcPc128
COMMAND bnProc_gen_128 .
DEPENDS bnProc_gen_128
COMMENT "Generating LDPC bnProc header files for 128-bit SIMD"
)
add_library(bnProc_gen_avx2_HEADERS INTERFACE)
target_include_directories(bnProc_gen_avx2_HEADERS INTERFACE ${CMAKE_CURRENT_BINARY_DIR})
add_dependencies(bnProc_gen_avx2_HEADERS bnProc_gen_avx2)
add_library(bnProc_gen_128_HEADERS INTERFACE)
target_include_directories(bnProc_gen_128_HEADERS INTERFACE ${CMAKE_CURRENT_BINARY_DIR})
add_dependencies(bnProc_gen_128_HEADERS bnProc_gen_128)
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The OpenAirInterface Software Alliance licenses this file to You under
* the OAI Public License, Version 1.1 (the "License"); you may not use this file
* except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.openairinterface.org/?page_id=698
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "../../nrLDPCdecoder_defs.h"
#include "../../nrLDPC_types.h"
void nrLDPC_bnProcPc_BG1_generator_128(const char *dir, int R)
{
const char *ratestr[3]={"13","23","89"};
if (R<0 || R>2) {printf("Illegal R %d\n",R); abort();}
// system("mkdir -p ../ldpc_gen_files");
char fname[FILENAME_MAX+1];
snprintf(fname, sizeof(fname), "%s/bnProcPc128/nrLDPC_bnProcPc_BG1_R%s_128.h", dir, ratestr[R]);
FILE *fd=fopen(fname,"w");
if (fd == NULL) {
printf("Cannot create file %s\n", fname);
abort();
}
fprintf(fd,"#include <stdint.h>\n");
fprintf(fd,"#include \"PHY/sse_intrin.h\"\n");
fprintf(fd,"static inline void nrLDPC_bnProcPc_BG1_R%s_128(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {\n",ratestr[R]);
const uint8_t* lut_numBnInBnGroups;
const uint32_t* lut_startAddrBnGroups;
const uint16_t* lut_startAddrBnGroupsLlr;
if (R==0) {
lut_numBnInBnGroups = lut_numBnInBnGroups_BG1_R13;
lut_startAddrBnGroups = lut_startAddrBnGroups_BG1_R13;
lut_startAddrBnGroupsLlr = lut_startAddrBnGroupsLlr_BG1_R13;
}
else if (R==1){
lut_numBnInBnGroups = lut_numBnInBnGroups_BG1_R23;
lut_startAddrBnGroups = lut_startAddrBnGroups_BG1_R23;
lut_startAddrBnGroupsLlr = lut_startAddrBnGroupsLlr_BG1_R23;
}
else if (R==2) {
lut_numBnInBnGroups = lut_numBnInBnGroups_BG1_R89;
lut_startAddrBnGroups = lut_startAddrBnGroups_BG1_R89;
lut_startAddrBnGroupsLlr = lut_startAddrBnGroupsLlr_BG1_R89;
}
else { printf("aborting, illegal R %d\n",R); fclose(fd);abort();}
// Number of BNs in Groups
uint32_t k;
// Offset to each bit within a group in terms of 32 Byte
uint32_t cnOffsetInGroup;
uint8_t idxBnGroup = 0;
fprintf(fd," // Process group with 1 CN\n");
fprintf(fd," uint32_t M = (%d*Z + 15)>>4;\n",lut_numBnInBnGroups[0]);
fprintf(fd," simde__m128i* p_bnProcBuf = (simde__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups [idxBnGroup]);
fprintf(fd," simde__m128i* p_bnProcBufRes = (simde__m128i*) &bnProcBufRes [%d];\n",lut_startAddrBnGroups [idxBnGroup]);
fprintf(fd," simde__m128i* p_llrProcBuf = (simde__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," simde__m128i* p_llrRes = (simde__m128i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," simde__m128i ymm0, ymm1, ymmRes0, ymmRes1;\n");
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_bnProcBufRes[i] = p_llrProcBuf[i];\n");
fprintf(fd," ymm0 = simde_mm_cvtepi8_epi16(p_bnProcBuf [i]);\n");
fprintf(fd," ymm1 = simde_mm_cvtepi8_epi16(p_llrProcBuf[i]);\n");
fprintf(fd," ymmRes0 = simde_mm_adds_epi16(ymm0, ymm1);\n");
fprintf(fd," ymm0 = simde_mm_cvtepi8_epi16(simde_mm_srli_si128(p_bnProcBuf [i],8));\n");
fprintf(fd," ymm1 = simde_mm_cvtepi8_epi16(simde_mm_srli_si128(p_llrProcBuf[i],8));\n");
fprintf(fd," ymmRes1 = simde_mm_adds_epi16(ymm0, ymm1);\n");
fprintf(fd," *p_llrRes = simde_mm_packs_epi16(ymmRes0, ymmRes1);\n");
fprintf(fd," p_llrRes++;\n");
fprintf(fd," }\n");
for (uint32_t cnidx=1;cnidx<30;cnidx++) {
// Process group with 4 CNs
if (lut_numBnInBnGroups[cnidx] > 0)
{
// If elements in group move to next address
idxBnGroup++;
fprintf(fd," M = (%d*Z + 15)>>4;\n",lut_numBnInBnGroups[cnidx]);
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[cnidx]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 2
fprintf(fd," p_bnProcBuf = (simde__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m128i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm_cvtepi8_epi16(p_bnProcBuf [i]);\n");
fprintf(fd," ymmRes1 = simde_mm_cvtepi8_epi16(simde_mm_srli_si128(p_bnProcBuf [i],8));\n");
// Loop over CNs
for (k=1; k<=cnidx; k++)
{
fprintf(fd," ymm0 = simde_mm_cvtepi8_epi16(p_bnProcBuf[%d + i]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm_cvtepi8_epi16(simde_mm_srli_si128(p_bnProcBuf[%d + i],8));\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," ymm0 = simde_mm_cvtepi8_epi16(p_llrProcBuf[i]);\n");
fprintf(fd," ymmRes0 = simde_mm_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm_cvtepi8_epi16(simde_mm_srli_si128(p_llrProcBuf[i],8));\n");
fprintf(fd," ymmRes1 = simde_mm_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," *p_llrRes = simde_mm_packs_epi16(ymmRes0, ymmRes1);\n");
fprintf(fd," p_llrRes++;\n");
fprintf(fd," }\n");
}
}
fprintf(fd,"}\n");
fclose(fd);
}//end of the function nrLDPC_bnProcPc_BG1
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The OpenAirInterface Software Alliance licenses this file to You under
* the OAI Public License, Version 1.1 (the "License"); you may not use this file
* except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.openairinterface.org/?page_id=698
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*/
#include <stdio.h>
#include <stdint.h>
#include "PHY/sse_intrin.h"
#include "../../nrLDPCdecoder_defs.h"
#include "../../nrLDPC_types.h"
void nrLDPC_bnProcPc_BG2_generator_128(const char *dir, int R)
{
const char *ratestr[3]={"15","13","23"};
if (R<0 || R>2) {printf("Illegal R %d\n",R); abort();}
// system("mkdir -p ../ldpc_gen_files");
char fname[FILENAME_MAX+1];
snprintf(fname, sizeof(fname), "%s/bnProcPc128/nrLDPC_bnProcPc_BG2_R%s_128.h", dir, ratestr[R]);
FILE *fd=fopen(fname,"w");
if (fd == NULL) {
printf("Cannot create file %s\n", fname);
abort();
}
fprintf(fd,"#include <stdint.h>\n");
fprintf(fd,"#include \"PHY/sse_intrin.h\"\n");
fprintf(fd,"static inline void nrLDPC_bnProcPc_BG2_R%s_128(int8_t* bnProcBuf,int8_t* bnProcBufRes,int8_t* llrRes , int8_t* llrProcBuf, uint16_t Z ) {\n",ratestr[R]);
const uint8_t* lut_numBnInBnGroups;
const uint32_t* lut_startAddrBnGroups;
const uint16_t* lut_startAddrBnGroupsLlr;
if (R==0) {
lut_numBnInBnGroups = lut_numBnInBnGroups_BG2_R15;
lut_startAddrBnGroups = lut_startAddrBnGroups_BG2_R15;
lut_startAddrBnGroupsLlr = lut_startAddrBnGroupsLlr_BG2_R15;
}
else if (R==1){
lut_numBnInBnGroups = lut_numBnInBnGroups_BG2_R13;
lut_startAddrBnGroups = lut_startAddrBnGroups_BG2_R13;
lut_startAddrBnGroupsLlr = lut_startAddrBnGroupsLlr_BG2_R13;
}
else if (R==2) {
lut_numBnInBnGroups = lut_numBnInBnGroups_BG2_R23;
lut_startAddrBnGroups = lut_startAddrBnGroups_BG2_R23;
lut_startAddrBnGroupsLlr = lut_startAddrBnGroupsLlr_BG2_R23;
}
else { printf("aborting, illegal R %d\n",R); fclose(fd);abort();}
// Number of BNs in Groups
uint32_t k;
// Offset to each bit within a group in terms of 32 Byte
uint32_t cnOffsetInGroup;
uint8_t idxBnGroup = 0;
fprintf(fd," // Process group with 1 CN\n");
fprintf(fd," uint32_t M = (%d*Z + 15)>>4;\n",lut_numBnInBnGroups[0]);
fprintf(fd," simde__m128i* p_bnProcBuf = (simde__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups [idxBnGroup]);
fprintf(fd," simde__m128i* p_bnProcBufRes = (simde__m128i*) &bnProcBufRes [%d];\n",lut_startAddrBnGroups [idxBnGroup]);
fprintf(fd," simde__m128i* p_llrProcBuf = (simde__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," simde__m128i* p_llrRes = (simde__m128i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," simde__m128i ymm0, ymm1, ymmRes0, ymmRes1;\n");
fprintf(fd," for (int i=0;i<M;i++) {\n");
fprintf(fd," p_bnProcBufRes[i] = p_llrProcBuf[i];\n");
fprintf(fd," ymm0 = simde_mm_cvtepi8_epi16(p_bnProcBuf [i]);\n");
fprintf(fd," ymm1 = simde_mm_cvtepi8_epi16(p_llrProcBuf[i]);\n");
fprintf(fd," ymmRes0 = simde_mm_adds_epi16(ymm0, ymm1);\n");
fprintf(fd," ymm0 = simde_mm_cvtepi8_epi16(simde_mm_srli_si128(p_bnProcBuf [i],8));\n");
fprintf(fd," ymm1 = simde_mm_cvtepi8_epi16(simde_mm_srli_si128(p_llrProcBuf[i],8));\n");
fprintf(fd," ymmRes1 = simde_mm_adds_epi16(ymm0, ymm1);\n");
fprintf(fd," *p_llrRes = simde_mm_packs_epi16(ymmRes0, ymmRes1);\n");
fprintf(fd," p_llrRes++;\n");
fprintf(fd," }\n");
for (uint32_t cnidx=1;cnidx<30;cnidx++) {
// Process group with 4 CNs
if (lut_numBnInBnGroups[cnidx] > 0)
{
// If elements in group move to next address
idxBnGroup++;
fprintf(fd," M = (%d*Z + 15)>>4;\n",lut_numBnInBnGroups[cnidx]);
// Set the offset to each CN within a group in terms of 16 Byte
cnOffsetInGroup = (lut_numBnInBnGroups[cnidx]*NR_LDPC_ZMAX)>>4;
// Set pointers to start of group 2
fprintf(fd," p_bnProcBuf = (simde__m128i*) &bnProcBuf [%d];\n",lut_startAddrBnGroups[idxBnGroup]);
fprintf(fd," p_llrProcBuf = (simde__m128i*) &llrProcBuf [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
fprintf(fd," p_llrRes = (simde__m128i*) &llrRes [%d];\n",lut_startAddrBnGroupsLlr[idxBnGroup]);
// Loop over BNs
fprintf(fd," for (int i=0;i<M;i++) {\n");
// First 16 LLRs of first CN
fprintf(fd," ymmRes0 = simde_mm_cvtepi8_epi16(p_bnProcBuf [i]);\n");
fprintf(fd," ymmRes1 = simde_mm_cvtepi8_epi16(simde_mm_srli_si128(p_bnProcBuf [i],8));\n");
// Loop over CNs
for (k=1; k<=cnidx; k++)
{
fprintf(fd," ymm0 = simde_mm_cvtepi8_epi16(p_bnProcBuf[%d + i]);\n", k*cnOffsetInGroup);
fprintf(fd," ymmRes0 = simde_mm_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm_cvtepi8_epi16(simde_mm_srli_si128(p_bnProcBuf[%d + i],8));\n", k*cnOffsetInGroup);
fprintf(fd, " ymmRes1 = simde_mm_adds_epi16(ymmRes1, ymm1); \n");
}
// Add LLR from receiver input
fprintf(fd," ymm0 = simde_mm_cvtepi8_epi16(p_llrProcBuf[i]);\n");
fprintf(fd," ymmRes0 = simde_mm_adds_epi16(ymmRes0, ymm0);\n");
fprintf(fd," ymm1 = simde_mm_cvtepi8_epi16(simde_mm_srli_si128(p_llrProcBuf[i],8));\n");
fprintf(fd," ymmRes1 = simde_mm_adds_epi16(ymmRes1, ymm1);\n");
// Pack results back to epi8
fprintf(fd," *p_llrRes = simde_mm_packs_epi16(ymmRes0, ymmRes1);\n");
fprintf(fd," p_llrRes++;\n");
fprintf(fd," }\n");
}
}
fprintf(fd,"}\n");
fclose(fd);
}//end of the function nrLDPC_bnProcPc_BG2
......@@ -4,7 +4,7 @@ add_executable(bnProc_gen_avx512
bnProcPc_gen_BG1_avx512.c
bnProcPc_gen_BG2_avx512.c
main.c)
target_compile_options(bnProc_gen_avx512 PRIVATE -W -Wall -mavx2)
target_compile_options(bnProc_gen_avx512 PRIVATE -W -Wall )
#set(bnProc_avx512_headers
# bnProc_avx512/rLDPC_bnProc_BG1_R13_AVX512.h
......
......@@ -2,7 +2,12 @@ add_executable(cnProc_gen_avx2
cnProc_gen_BG1_avx2.c
cnProc_gen_BG2_avx2.c
main.c)
target_compile_options(cnProc_gen_avx2 PRIVATE -W -Wall -mavx2)
# Generator executable for the 128-bit SIMD variant of the cnProc headers,
# built from its own BG1/BG2 generator sources and a dedicated main128.c.
add_executable(cnProc_gen_128
cnProc_gen_BG1_128.c
cnProc_gen_BG2_128.c
main128.c)
# Both generators get warning flags only; no instruction-set flag is passed here.
# NOTE(review): presumably this keeps the generators buildable on non-x86 hosts
# such as aarch64 - confirm.
target_compile_options(cnProc_gen_avx2 PRIVATE -W -Wall )
target_compile_options(cnProc_gen_128 PRIVATE -W -Wall )
#set(cnProc_headers
# cnProc/rLDPC_cnProc_BG1_R13_AVX2.h
......@@ -20,6 +25,17 @@ add_custom_command(TARGET cnProc_gen_avx2 POST_BUILD
COMMENT "Generating LDPC cnProc header files for AVX2"
)
# Run the 128-bit generator right after it is built: create the cnProc128/
# output directory in the build tree, then invoke the generator with "." as
# its output root.
# - DEPENDS is not an argument of the TARGET signature (newer CMake rejects it
#   under policy CMP0175), and the command is already attached to the target,
#   so it is dropped.
# - WORKING_DIRECTORY is spelled out so the relative paths visibly resolve
#   inside ${CMAKE_CURRENT_BINARY_DIR}.
# - VERBATIM makes argument escaping platform-independent.
add_custom_command(TARGET cnProc_gen_128 POST_BUILD
  COMMAND ${CMAKE_COMMAND} -E make_directory cnProc128
  COMMAND cnProc_gen_128 .
  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  COMMENT "Generating LDPC cnProc header files for 128-bit SIMD"
  VERBATIM
)
# Header-only INTERFACE target for the AVX2 generator output: consumers get the
# current build directory on their include path, and the add_dependencies()
# edge makes the generator target build before anything that uses this target.
add_library(cnProc_gen_avx2_HEADERS INTERFACE)
target_include_directories(cnProc_gen_avx2_HEADERS INTERFACE ${CMAKE_CURRENT_BINARY_DIR})
add_dependencies(cnProc_gen_avx2_HEADERS cnProc_gen_avx2)
# Same arrangement for the 128-bit SIMD generator output.
add_library(cnProc_gen_128_HEADERS INTERFACE)
target_include_directories(cnProc_gen_128_HEADERS INTERFACE ${CMAKE_CURRENT_BINARY_DIR})
add_dependencies(cnProc_gen_128_HEADERS cnProc_gen_128)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment