Adaptations of the LDPC decoder generator for ARMv8 performance

4cd25ba6 · Raymond · laurent · 81a2da50 · 4cd25ba6 · 4cd25ba6
Commit 4cd25ba6, authored Dec 27, 2022 by Raymond; committed by laurent on Apr 11, 2023
6 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -245,7 +245,7 @@ set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} -ggdb2 -Wl,-rpath -Wl,${C
 # these changes are related to hardcoded path to include .h files
 set(debugOpt "-ggdb2 -DMALLOC_CHECK_=3 -fno-delete-null-pointer-checks")
 set(CMAKE_C_FLAGS_DEBUG "${debugOpt} -O0")
-set(CMAKE_C_FLAGS_RELWITHDEBINFO "${debugOpt} -O2")
+set(CMAKE_C_FLAGS_RELWITHDEBINFO "${debugOpt} -O3")
 set(CMAKE_C_FLAGS_RELEASE "-O3")

 # Enable assert() for RelWithDebInfo builds

--- a/common/utils/time_meas.c
+++ b/common/utils/time_meas.c
@@ -38,7 +38,7 @@ static time_stats_t  **measur_table;
 notifiedFIFO_t measur_fifo;
 double get_cpu_freq_GHz(void)
 {
-  if (cpu_freq_GHz <1 ) {
+  if (cpu_freq_GHz <0.01 ) {
    time_stats_t ts = {0};
    reset_meas(&ts);
    ts.trials++;
@@ -46,8 +46,7 @@ double get_cpu_freq_GHz(void)
    sleep(1);
    ts.diff = (rdtsc_oai()-ts.in);
    cpu_freq_GHz = (double)ts.diff/1000000000;
-    printf("CPU Freq is %f \n", cpu_freq_GHz);
-  }
+  } 
  return cpu_freq_GHz;
 }


--- a/openair1/PHY/CODING/TESTBENCH/coding_unitary_defs.h
+++ b/openair1/PHY/CODING/TESTBENCH/coding_unitary_defs.h
@@ -34,9 +34,9 @@ void exit_function(const char* file, const char* function, const int line, const
  exit(-1);
 }

-signed char quantize(double D, double x, unsigned char B) {
+int8_t quantize(double D, double x, uint8_t B) {
  double qxd;
-  short maxlev;
+  int16_t maxlev;
  qxd = floor(x / D);
  maxlev = 1 << (B - 1); //(char)(pow(2,B-1));

@@ -45,7 +45,7 @@ signed char quantize(double D, double x, unsigned char B) {
  else if (qxd >= maxlev)
    qxd = maxlev - 1;

-  return ((char) qxd);
+  return ((int8_t) qxd);
 }



--- a/openair1/PHY/CODING/TESTBENCH/ldpctest.c
+++ b/openair1/PHY/CODING/TESTBENCH/ldpctest.c
@@ -23,6 +23,7 @@
 #include <math.h>
 #include <stdio.h>
 #include <string.h>
+#include <stdint.h>
 #include "assertions.h"
 #include "SIMULATION/TOOLS/sim.h"
 #include "common/utils/load_module_shlib.h"
@@ -43,7 +44,7 @@
 #define NR_LDPC_ENABLE_PARITY_CHECK

 // 4-bit quantizer
-char quantize4bit(double D,double x)
+int8_t quantize4bit(double D,double x)
 {
  double qxd;
  qxd = floor(x/D);
@@ -54,13 +55,13 @@ char quantize4bit(double D,double x)
  else if (qxd > 7)
    qxd = 7;

-  return((char)qxd);
+  return((int8_t)qxd);
 }

-char quantize8bit(double D,double x)
+int8_t quantize8bit(double D,double x)
 {
  double qxd;
-  //char maxlev;
+  //int8_t maxlev;
  qxd = floor(x/D);

  //maxlev = 1<<(B-1);
@@ -72,7 +73,7 @@ char quantize8bit(double D,double x)
  else if (qxd >= 128)
    qxd = 127;

-  return((char)qxd);
+  return((int8_t)qxd);
 }

 typedef struct {
@@ -92,7 +93,7 @@ int test_ldpc(short max_iterations,
              int nom_rate,
              int denom_rate,
              double SNR,
-              unsigned char qbits,
+              uint8_t qbits,
              short block_length,
              unsigned int ntrials,
              int n_segments,
@@ -117,15 +118,15 @@ int test_ldpc(short max_iterations,
  sigma = 1.0/sqrt(2*SNR);
  opp_enabled=1;
  //short test_input[block_length];
-  unsigned char *test_input[MAX_NUM_NR_DLSCH_SEGMENTS_PER_LAYER*NR_MAX_NB_LAYERS]={NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL};;
+  uint8_t *test_input[MAX_NUM_NR_DLSCH_SEGMENTS_PER_LAYER*NR_MAX_NB_LAYERS]={NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL};;
  //short *c; //padded codeword
-  unsigned char estimated_output[MAX_NUM_DLSCH_SEGMENTS][block_length];
+  uint8_t estimated_output[MAX_NUM_DLSCH_SEGMENTS][block_length];
  memset(estimated_output, 0, sizeof(estimated_output));
-  unsigned char *channel_input[MAX_NUM_DLSCH_SEGMENTS];
-  unsigned char *channel_input_optim[MAX_NUM_DLSCH_SEGMENTS];
+  uint8_t *channel_input[MAX_NUM_DLSCH_SEGMENTS];
+  uint8_t *channel_input_optim[MAX_NUM_DLSCH_SEGMENTS];
  //double channel_output[68 * 384];
  double modulated_input[MAX_NUM_DLSCH_SEGMENTS][68 * 384] = { 0 };
-  char channel_output_fixed[MAX_NUM_DLSCH_SEGMENTS][68  * 384] = { 0 };
+  int8_t channel_output_fixed[MAX_NUM_DLSCH_SEGMENTS][68  * 384] = { 0 };
  short BG=0,nrows=0;//,ncols;
  int no_punctured_columns,removed_bit;
  int i1,Zc,Kb=0;
@@ -149,12 +150,12 @@ int test_ldpc(short max_iterations,

  // generate input block
  for(int j=0;j<MAX_NUM_DLSCH_SEGMENTS;j++) {
-    test_input[j]=(unsigned char *)malloc16(sizeof(unsigned char) * block_length/8);
-    memset(test_input[j], 0, sizeof(unsigned char) * block_length / 8);
-    channel_input[j] = (unsigned char *)malloc16(sizeof(unsigned char) * 68*384);
-    memset(channel_input[j], 0, sizeof(unsigned char) * 68 * 384);
-    channel_input_optim[j] = (unsigned char *)malloc16(sizeof(unsigned char) * 68*384);
-    memset(channel_input_optim[j], 0, sizeof(unsigned char) * 68 * 384);
+    test_input[j]=(uint8_t *)malloc16(sizeof(uint8_t) * block_length/8);
+    memset(test_input[j], 0, sizeof(uint8_t) * block_length / 8);
+    channel_input[j] = (uint8_t *)malloc16(sizeof(uint8_t) * 68*384);
+    memset(channel_input[j], 0, sizeof(uint8_t) * 68 * 384);
+    channel_input_optim[j] = (uint8_t *)malloc16(sizeof(uint8_t) * 68*384);
+    memset(channel_input_optim[j], 0, sizeof(uint8_t) * 68 * 384);
  }

  reset_meas(&time);
@@ -179,7 +180,7 @@ int test_ldpc(short max_iterations,

  for (int j=0;j<MAX_NUM_DLSCH_SEGMENTS;j++) {
    for (int i=0; i<block_length/8; i++) {
-      test_input[j][i]=(unsigned char) rand();
+      test_input[j][i]=(uint8_t) rand();
      //test_input[j][i]=j%256;
      //test_input[j][i]=252;
    }
@@ -326,21 +327,21 @@ int test_ldpc(short max_iterations,
            modulated_input[j][i]=-1.0;///sqrt(2);

          ///channel_output[i] = modulated_input[i] + gaussdouble(0.0,1.0) * 1/sqrt(2*SNR);
-          //channel_output_fixed[i] = (char) ((channel_output[i]*128)<0?(channel_output[i]*128-0.5):(channel_output[i]*128+0.5)); //fixed point 9-7
+          //channel_output_fixed[i] = (int8_t) ((channel_output[i]*128)<0?(channel_output[i]*128-0.5):(channel_output[i]*128+0.5)); //fixed point 9-7
          //printf("llr[%d]=%d\n",i,channel_output_fixed[i]);

-          //channel_output_fixed[i] = (char)quantize(sigma/4.0,(2.0*modulated_input[i]) - 1.0 + sigma*gaussdouble(0.0,1.0),qbits);
-          channel_output_fixed[j][i] = (char)quantize(sigma/4.0/4.0,modulated_input[j][i] + sigma*gaussdouble(0.0,1.0),qbits);
-          //channel_output_fixed[i] = (char)quantize8bit(sigma/4.0,(2.0*modulated_input[i]) - 1.0 + sigma*gaussdouble(0.0,1.0));
+          //channel_output_fixed[i] = (int8_t)quantize(sigma/4.0,(2.0*modulated_input[i]) - 1.0 + sigma*gaussdouble(0.0,1.0),qbits);
+          channel_output_fixed[j][i] = (int8_t)quantize(sigma/4.0/4.0,modulated_input[j][i] + sigma*gaussdouble(0.0,1.0),qbits);
+          //channel_output_fixed[i] = (int8_t)quantize8bit(sigma/4.0,(2.0*modulated_input[i]) - 1.0 + sigma*gaussdouble(0.0,1.0));
          //printf("llr[%d]=%d\n",i,channel_output_fixed[i]);
          //printf("channel_output_fixed[%d]: %d\n",i,channel_output_fixed[i]);


          //Uncoded BER
-          unsigned char channel_output_uncoded = channel_output_fixed[j][i]<0 ? 1 /* QPSK demod */ : 0;
+          uint8_t channel_output_uncoded = channel_output_fixed[j][i]<0 ? 1 /* QPSK demod */ : 0;

          if (channel_output_uncoded != channel_input_optim[j][i-2*Zc])
-      *errors_bit_uncoded = (*errors_bit_uncoded) + 1;
+            *errors_bit_uncoded = (*errors_bit_uncoded) + 1;

        }
     
@@ -367,8 +368,8 @@ int test_ldpc(short max_iterations,
        }
        for (int i=0; i<block_length; i++)
        {
-          unsigned char estoutputbit = (estimated_output[j][i/8]&(1<<(i&7)))>>(i&7);
-          unsigned char inputbit = (test_input[j][i/8]&(1<<(i&7)))>>(i&7); // Further correct for multiple segments
+          uint8_t estoutputbit = (estimated_output[j][i/8]&(1<<(i&7)))>>(i&7);
+          uint8_t inputbit = (test_input[j][i/8]&(1<<(i&7)))>>(i&7); // Further correct for multiple segments
          if (estoutputbit != inputbit)
            *errors_bit = (*errors_bit) + 1;
        }
@@ -423,13 +424,13 @@ int test_ldpc(short max_iterations,
  return *errors;
 }

-int main(int argc, char *argv[])
+int main(int argc, int8_t *argv[])
 {

  unsigned int errors, errors_bit, crc_misses;
  double errors_bit_uncoded;
  short block_length=8448; // decoder supports length: 1201 -> 1280, 2401 -> 2560
-  char *ldpc_version=NULL; /* version of the ldpc decoder library to use (XXX suffix to use when loading libldpc_XXX.so */
+  int8_t *ldpc_version=NULL; /* version of the ldpc decoder library to use (XXX suffix to use when loading libldpc_XXX.so */
  short max_iterations=5;
  int n_segments=1;
  //double rate=0.333;
@@ -437,7 +438,7 @@ int main(int argc, char *argv[])
  int nom_rate=1;
  int denom_rate=3;
  double SNR0=-2.0,SNR,SNR_lin;
-  unsigned char qbits=8;
+  uint8_t qbits=8;
  unsigned int decoded_errors[10000]; // initiate the size of matrix equivalent to size of SNR
  int c,i=0, i1 = 0;


--- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c
+++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c
@@ -112,7 +112,7 @@


 #define NR_LDPC_ENABLE_PARITY_CHECK
-//#define NR_LDPC_PROFILER_DETAIL
+#define NR_LDPC_PROFILER_DETAIL

 #ifdef NR_LDPC_DEBUG_MODE
 #include "nrLDPC_tools/nrLDPC_debug.h"

--- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_cnProc/cnProc_gen_BG1_avx2.c
+++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_tools/generator_cnProc/cnProc_gen_BG1_avx2.c