Merge remote-tracking branch 'origin/NR_ldpc_cuda_softmodem' into integration_2021_wk44

6a5d8b86 · hardy · af7d9cd9 · 8dd5346e · 6a5d8b86 · 6a5d8b86
Commit 6a5d8b86 authored Oct 27, 2021 by hardy
18 changed files
--- a/cmake_targets/CMakeLists.txt
+++ b/cmake_targets/CMakeLists.txt
@@ -309,6 +309,7 @@ endif()

 #
 # add autotools definitions that were maybe used!
+
 add_definitions("-DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_FCNTL_H=1 -DHAVE_ARPA_INET_H=1 -DHAVE_SYS_TIME_H=1 -DHAVE_SYS_SOCKET_H=1 -DHAVE_STRERROR=1 -DHAVE_SOCKET=1 -DHAVE_MEMSET=1 -DHAVE_GETTIMEOFDAY=1 -DHAVE_STDLIB_H=1 -DHAVE_MALLOC=1 -DHAVE_LIBSCTP")

 set(commonOpts "-pipe -Wno-packed-bitfield-compat -fPIC -Wall -fno-strict-aliasing -rdynamic")
@@ -318,11 +319,7 @@ set(CMAKE_C_FLAGS
 set(CMAKE_CXX_FLAGS
  "${CMAKE_CXX_FLAGS} ${C_FLAGS_PROCESSOR}  ${commonOpts} -std=c++11")

-# cuda compiler bug (limitation) on complex macro definition
-if (CUDA_FOUND)
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DCUDA_FLAG")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCUDA_FLAG")
-endif()
+

 if (SANITIZE_ADDRESS)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-common")
@@ -1590,6 +1587,16 @@ set(PHY_LDPC_OPTIM8SEGMULTI_SRC
  ${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c
  ${OPENAIR1_DIR}/PHY/CODING/nrLDPC_encoder/ldpc_encoder_optim8segmulti.c
 )
+set(PHY_LDPC_CUDA_SRC
+  ${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
+  ${OPENAIR1_DIR}/PHY/CODING/nrLDPC_encoder/ldpc_encoder_optim8segmulti.c
+)
+
+set(PHY_LDPC_CL_SRC
+  ${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_CL.c
+  ${OPENAIR1_DIR}/PHY/CODING/nrLDPC_encoder/ldpc_encoder_optim8segmulti.c
+)
+
 set(PHY_NR_CODINGIF
  ${OPENAIR1_DIR}/PHY/CODING/nrLDPC_load.c;
 )
@@ -1597,8 +1604,18 @@ set(PHY_NR_CODINGIF
 add_library(ldpc_orig MODULE ${PHY_LDPC_ORIG_SRC} )
 add_library(ldpc_optim MODULE ${PHY_LDPC_OPTIM_SRC} )
 add_library(ldpc_optim8seg MODULE ${PHY_LDPC_OPTIM8SEG_SRC} )
+add_library(ldpc_cl MODULE ${PHY_LDPC_CL_SRC} )
+target_link_libraries(ldpc_cl OpenCL)
+
+if (CUDA_FOUND)
+  cuda_add_library(ldpc_cuda MODULE ${PHY_LDPC_CUDA_SRC} )
+  set_target_properties(ldpc_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+#  CUDA_ADD_CUFFT_TO_TARGET(ldpc_cuda)
+endif (CUDA_FOUND)
+
 add_library(ldpc MODULE ${PHY_LDPC_OPTIM8SEGMULTI_SRC} )

+
 add_library(coding MODULE ${PHY_TURBOSRC} )

 add_library(dfts MODULE ${OPENAIR1_DIR}/PHY/TOOLS/oai_dfts.c )
@@ -3119,7 +3136,11 @@ target_link_libraries (nr-uesoftmodem ${LIB_LMS_LIBRARIES})
 target_link_libraries (nr-uesoftmodem ${T_LIB})

 add_dependencies( nr-uesoftmodem ldpc_orig ldpc_optim ldpc_optim8seg ldpc )
-
+if (CUDA_FOUND)
+   add_dependencies( nr-uesoftmodem ldpc_cuda)
+   add_dependencies( nr-softmodem ldpc_cuda)
+   add_dependencies( ocp-gnb ldpc_cuda)
+endif (CUDA_FOUND)
 ###################################"
 # Addexecutables for tests
 ####################################
@@ -3179,46 +3200,18 @@ target_link_libraries(smallblocktest
  m pthread ${ATLAS_LIBRARIES} dl
  )

-if (CUDA_FOUND)
-###################################################
-# For CUDA library
-###################################################
-
-    CUDA_ADD_LIBRARY(LDPC_CU
-      ${OPENAIR1_DIR}/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
-      )
-    CUDA_ADD_CUFFT_TO_TARGET(LDPC_CU)
-    cuda_add_executable(ldpctest
-      ${OPENAIR1_DIR}/PHY/CODING/TESTBENCH/ldpctest.c
-      ${T_SOURCE}
-      ${SHLIB_LOADER_SOURCES}
-      )
-    target_link_libraries(ldpctest -ldl
-      -Wl,--start-group
-      LDPC_CU UTIL SIMU PHY_NR CONFIG_LIB
-      -Wl,--end-group
-      m pthread ${ATLAS_LIBRARIES} dl
-      )

-else (CUDA_FOUND)
-    add_executable(ldpctest
+add_executable(ldpctest
   ${PHY_NR_CODINGIF}
   ${OPENAIR1_DIR}/PHY/CODING/TESTBENCH/ldpctest.c
   ${T_SOURCE}
   ${SHLIB_LOADER_SOURCES}
   )

-endif ()
-
-
-# add_executable(ldpctest
-  # ${PHY_NR_CODINGIF}
-  # ${OPENAIR1_DIR}/PHY/CODING/TESTBENCH/ldpctest.c
-  # ${T_SOURCE}
-  # ${SHLIB_LOADER_SOURCES}
-  # )
 add_dependencies( ldpctest ldpc_orig ldpc_optim ldpc_optim8seg ldpc )
-
+if (CUDA_FOUND)
+   add_dependencies( ldpctest ldpc_cuda)
+endif (CUDA_FOUND)
 target_link_libraries(ldpctest
  -Wl,--start-group UTIL SIMU_COMMON  SIMU PHY_NR PHY_COMMON PHY_NR_COMMON CONFIG_LIB -Wl,--end-group
  m pthread ${ATLAS_LIBRARIES} dl

--- a/common/utils/DOC/loader/devusage/api.md
+++ b/common/utils/DOC/loader/devusage/api.md
@@ -10,6 +10,10 @@ int load_module_shlib(char *modname,loader_shlibfunc_t *farray, int numf)
 * If the farray pointer is null,  looks for `< modname >_getfarray` symbol, calls the corresponding function when the symbol is found. `< modname >_getfarray` takes one argument, a pointer to a  `loader_shlibfunc_t` array, and returns the number of items in this array, as defined by the `getfarrayfunc_t` type. The `loader_shlibfunc_t` array returned by the shared library must be fully filled (both `fname` and `fptr` fields).
 * looks for the `numf` function symbols listed in the `farray[i].fname` arguments and set the corresponding `farray[i].fptr`function pointers

+```c
+int load_module_version_shlib(char *modname, char *version, loader_shlibfunc_t *farray, int numf)
+```
+Allows loading a specific library version, as specified by the `version` parameter. When version is not NULL the version that is possibly specified as a config module parameter is ignored. This call has been introduced for phy simulators executables which do not use the config module. It is used, for example,  by the ldcp initialization (`load_nrLDPClib` function in [nrLDPC_load.c](../../../../../openair1/PHY/CODING/nrLDPC_load.c)  to allow the `ldpctest` simulator to select the cuda accelerated ldcp implementation. `load_module_shlib` is just a define macro to switch to a `load_module_version_shlib` call,  adding a NULL pointer for the version parameter.

 ```c
 void * get_shlibmodule_fptr(char *modname, char *fname)

--- a/common/utils/load_module_shlib.c
+++ b/common/utils/load_module_shlib.c
@@ -63,7 +63,7 @@ void loader_init(void) {
 }

 /* build the full shared lib name from the module name */
-char *loader_format_shlibpath(char *modname)
+char *loader_format_shlibpath(char *modname, char *version)
 {

 char *tmpstr;
@@ -97,7 +97,10 @@ int ret;
       shlibpath =  loader_data.shlibpath ;
   } 
 /* no specific shared lib version */
-   if (shlibversion == NULL) {
+   if (version != NULL) {    // version specified as a function parameter
+	   shlibversion=version;
+   }
+   if (shlibversion == NULL) {  // no specific version specified, neither as a config param or as a function param
       shlibversion = "" ;
   } 
 /* alloc memory for full module shared lib file name */
@@ -118,7 +121,7 @@ int ret;
   return tmpstr; 
 }

-int load_module_shlib(char *modname,loader_shlibfunc_t *farray, int numf, void *autoinit_arg)
+int load_module_version_shlib(char *modname, char *version, loader_shlibfunc_t *farray, int numf, void *autoinit_arg)
 {
  void *lib_handle = NULL;
  initfunc_t fpi;
@@ -138,7 +141,7 @@ int load_module_shlib(char *modname,loader_shlibfunc_t *farray, int numf, void *
     loader_init();
  }

-  shlib_path = loader_format_shlibpath(modname);
+  shlib_path = loader_format_shlibpath(modname, version);

  for (int i = 0; i < loader_data.numshlibs; i++) {
    if (strcmp(loader_data.shlibs[i].name, modname) == 0) {

--- a/common/utils/load_module_shlib.h
+++ b/common/utils/load_module_shlib.h
@@ -84,10 +84,11 @@ loader_data_t loader_data;

 /*-------------------------------------------------------------------------------------------------------------*/
 #else  /* LOAD_MODULE_SHLIB_MAIN */
-extern int load_module_shlib(char *modname, loader_shlibfunc_t *farray, int numf, void *initfunc_arg);
+
+extern int load_module_version_shlib(char *modname, char *version, loader_shlibfunc_t *farray, int numf, void *initfunc_arg);
 extern void * get_shlibmodule_fptr(char *modname, char *fname);
 extern loader_data_t loader_data;
 #endif /* LOAD_MODULE_SHLIB_MAIN */
-
+#define load_module_shlib(M, F, N, I) load_module_version_shlib(M, NULL, F, N, I)
 #endif

--- a/doc/RUNMODEM.md
+++ b/doc/RUNMODEM.md
@@ -203,10 +203,7 @@ UE on machine 2:



-
-
-
-
+[Selecting an alternative ldpc implementation at run time](../openair1/PHY/CODING/DOC/LDPCImplementation.md)

 [oai wiki home](https://gitlab.eurecom.fr/oai/openairinterface5g/wikis/home)


--- a/executables/nr-uesoftmodem.c
+++ b/executables/nr-uesoftmodem.c
@@ -436,7 +436,7 @@ int main( int argc, char **argv ) {
  itti_init(TASK_MAX, tasks_info);

  init_opt() ;
-  load_nrLDPClib();
+  load_nrLDPClib(NULL);

  if (ouput_vcd) {
    vcd_signal_dumper_init("/tmp/openair_dump_nrUE.vcd");

--- a/openair1/PHY/CODING/DOC/LDPCImplementation.md
+++ b/openair1/PHY/CODING/DOC/LDPCImplementation.md
-#LDPC coder/decoder implementation
+# LDPC coder/decoder implementation
 The LDPC coder and decoder are implemented in a shared library, dynamically loaded at run-time using the [oai shared library loader](file://../../../../common/utils/DOC/loader.md). The code loading the LDPC library is in [nrLDPC_load.c](file://../nrLDPC_load.c), in function `load_nrLDPClib`, which must be called at init time.

 ## Selecting the LDPC library at run time

 By default the function `int load_nrLDPClib(void)` looks for `libldpc.so`, this default behavior can be changed using the oai loader configuration options in the configuration file or from the command line as shown below:

->loading `libldpc_optim8seg.so` instead of `libldpc.so`
+#### Examples of ldpc shared lib selection when running nr softmodem's:
+
+loading `libldpc_optim8seg.so` instead of `libldpc.so`:

 ```
 ./nr-softmodem -O libconfig:gnb.band78.tm1.106PRB.usrpx300.conf:dbgl5  --loader.ldpc.shlibversion _optim8seg
@@ -18,9 +20,160 @@ By default the function `int load_nrLDPClib(void)` looks for `libldpc.so`, this
 ........................
 ```

-Today, this mechanism is not available in the `ldpctest` phy simulator which doesn't initialize the [configuration module](file://../../../../common/config/DOC/config.md). loads `libldpc.so` and `libldpc_orig.so` to compare the performance of the two implementations.
+loading `libldpc_cl.so` instead of `libldpc.so`:
+
+`make ldpc_cl`
+
+`cp ../../../openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_kernels_CL.cl`
+
+`./nr-softmodem -O  libconfig:gnb.band78.sa.fr1.106PRB.usrpb210.conf:dbgl5 --rfsim --rfsimulator.serveraddr server  --sa --log_config.gtpu_log_level info  --loader.ldpc.shlibversion _cl`
+
+``` [LOADER] library libldpc_cl.so successfully loaded
+------------------------------------------------
+[HW]   Platform 0, OpenCL profile FULL_PROFILE
+[HW]   Platform 0, OpenCL version OpenCL 2.1 LINUX
+[HW]   Device 0 is  available
+[HW]   Device 0, type 2 = 0x00000002: cpu 
+[HW]   Device 0, number of Compute Units: 8
+[HW]   Device 0, max Work Items dimension: 3
+[HW]   Device 0, max Work Items size for dimension: 0 8192
+[HW]   Device 0, max Work Items size for dimension: 1 8192
+[HW]   Device 0, max Work Items size for dimension: 2 8192
+[New Thread 0x7fffcc258700 (LWP 3945123)]
+[New Thread 0x7fffc3e57700 (LWP 3945124)]
+[New Thread 0x7fffcbe57700 (LWP 3945125)]
+[New Thread 0x7fffcba56700 (LWP 3945126)]
+[New Thread 0x7fffcb254700 (LWP 3945128)]
+[New Thread 0x7fffcb655700 (LWP 3945127)]
+[New Thread 0x7fffcae53700 (LWP 3945129)]
+[HW]   Platform 1, OpenCL profile FULL_PROFILE
+[HW]   Platform 1, OpenCL version OpenCL 2.0 beignet 1.3
+[New Thread 0x7fffc965a700 (LWP 3945130)]
+[Thread 0x7fffc965a700 (LWP 3945130) exited]
+[HW]   Device 0 is  available
+[HW]   Device 0, type 4 = 0x00000004: gpu 
+[HW]   Device 0, number of Compute Units: 20
+[HW]   Device 0, max Work Items dimension: 3
+[HW]   Device 0, max Work Items size for dimension: 0 512
+[HW]   Device 0, max Work Items size for dimension: 1 512
+[HW]   Device 0, max Work Items size for dimension: 2 512
+-----------------------------------------------------------------
+```
+
+`./nr-uesoftmodem -r 106 --numerology 1 --band 78 -C 3619200000 --rfsim --sa -O libconfig:/usr/local/oai/conf/nrue_sim.conf:dbgl5  --nokrnmod --loader.ldpc.shlibversion _cl --log_config.hw_log_level info`
+
+```[CONFIG] shlibversion set to  _cl from command line
+............................................................
+[CONFIG] loader.ldpc 1 options set from command line
+[LOADER] library libldpc_cl.so successfully loaded
+[HW]   Platform 0, OpenCL profile FULL_PROFILE
+[HW]   Platform 0, OpenCL version OpenCL 2.1 LINUX
+[HW]   Device 0 is  available
+[HW]   Device 0, type 2 = 0x00000002: cpu 
+[HW]   Device 0, number of Compute Units: 8
+[HW]   Device 0, max Work Items dimension: 3
+[HW]   Device 0, max Work Items size for dimension: 0 8192
+[HW]   Device 0, max Work Items size for dimension: 1 8192
+[HW]   Device 0, max Work Items size for dimension: 2 8192
+[New Thread 0x7fffecccc700 (LWP 3945413)]
+[New Thread 0x7fffec8cb700 (LWP 3945415)]
+[New Thread 0x7fffec4ca700 (LWP 3945414)]
+[New Thread 0x7fffdf7fd700 (LWP 3945417)]
+[New Thread 0x7fffdfbfe700 (LWP 3945418)]
+[New Thread 0x7fffdffff700 (LWP 3945416)]
+[New Thread 0x7fffd73fc700 (LWP 3945419)]
+[HW]   Platform 1, OpenCL profile FULL_PROFILE
+[HW]   Platform 1, OpenCL version OpenCL 2.0 beignet 1.3
+[New Thread 0x7fffde105700 (LWP 3945420)]
+[Thread 0x7fffde105700 (LWP 3945420) exited]
+[HW]   Device 0 is  available
+[HW]   Device 0, type 4 = 0x00000004: gpu 
+[HW]   Device 0, number of Compute Units: 20
+[HW]   Device 0, max Work Items dimension: 3
+[HW]   Device 0, max Work Items size for dimension: 0 512
+[HW]   Device 0, max Work Items size for dimension: 1 512
+[HW]   Device 0, max Work Items size for dimension: 2 512 
+------------------------------------------------------------
+```
+```
+
+A mechanism to select ldpc implementation is also available in the `ldpctest` phy simulator via the `-v`option, which can be used to specify the version of the ldpc shared library to be used.
+
+#### Examples of ldpc shared lib selection when running ldpctest:
+
+Loading libldpc_cuda.so, the cuda implementation of the ldpc decoder:
+
+```$ ./ldpctest -v _cuda
+Initializing random number generator, seed 0
+block length 8448: 
+n_trials 1: 
+SNR0 -2.000000: 
+[CONFIG] get parameters from cmdline , debug flags: 0x00400000
+[CONFIG] log_config: 2/3 parameters successfully set 
+[CONFIG] log_config: 53/53 parameters successfully set 
+[CONFIG] log_config: 53/53 parameters successfully set 
+[CONFIG] log_config: 16/16 parameters successfully set 
+[CONFIG] log_config: 16/16 parameters successfully set 
+log init done
+[CONFIG] loader: 2/2 parameters successfully set 
+[CONFIG] loader.ldpc: 1/2 parameters successfully set 
+[LOADER] library libldpc_cuda.so successfully loaded
+...................................
+```
+```
+
+Loading libldpc_cl.so, the opencl implementation of the ldpc decoder:
+
+`make ldpc_cl`
+
+`cp ../../../openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_kernels_CL.cl`
+
+`./ldpctest -v _cl`
+
+```$ ./ldpctest -v _cl
+Initializing random number generator, seed 0
+block length 8448: 
+n_trials 1: 
+SNR0 -2.000000: 
+[CONFIG] get parameters from cmdline , debug flags: 0x00400000
+[CONFIG] log_config: 2/3 parameters successfully set 
+[CONFIG] log_config: 53/53 parameters successfully set 
+[CONFIG] log_config: 53/53 parameters successfully set 
+[CONFIG] log_config: 16/16 parameters successfully set 
+[CONFIG] log_config: 16/16 parameters successfully set 
+log init done
+[CONFIG] loader: 2/2 parameters successfully set 
+[CONFIG] loader.ldpc: 1/2 parameters successfully set 
+[LOADER] library libldpc_cl.so successfully loaded
+[HW]   Platform 0, OpenCL profile FULL_PROFILE
+[HW]   Platform 0, OpenCL version OpenCL 2.1 LINUX
+[HW]   Device 0 is  available
+[HW]   Device 0, type 2 = 0x00000002: cpu 
+[HW]   Device 0, number of Compute Units: 8
+[HW]   Device 0, max Work Items dimension: 3
+[HW]   Device 0, max Work Items size for dimension: 0 8192
+[HW]   Device 0, max Work Items size for dimension: 1 8192
+[HW]   Device 0, max Work Items size for dimension: 2 8192
+[HW]   Platform 1, OpenCL profile FULL_PROFILE
+[HW]   Platform 1, OpenCL version OpenCL 2.0 beignet 1.3
+[HW]   Device 0 is  available
+[HW]   Device 0, type 4 = 0x00000004: gpu 
+[HW]   Device 0, number of Compute Units: 20
+[HW]   Device 0, max Work Items dimension: 3
+[HW]   Device 0, max Work Items size for dimension: 0 512
+[HW]   Device 0, max Work Items size for dimension: 1 512
+[HW]   Device 0, max Work Items size for dimension: 2 512
+................................
+```
+```
+
+
+
+### LDPC libraries
+Libraries implementing the LDPC algorithms must be named `libldpc<_version>.so`, they must implement three functions: `nrLDPC_initcall` `nrLDPC_decod` and `nrLDPC_encod`. The prototypes for these functions is defined in [nrLDPC_defs.h](file://nrLDPC_defs.h).
+
+`libldpc_cuda.so`has been tested with the `ldpctest` executable, usage from the softmodem's has to be tested.

-###LDPC libraries
-Libraries implementing the LDPC algorithms must be named `libldpc<_version>.so`, they must implement two functions: `nrLDPC_decod` and `nrLDPC_encod`. The prototypes for these functions is defined in [nrLDPC_defs.h](file://nrLDPC_defs.h).
+`libldpc_cl`is under development.

 [oai Wikis home](https://gitlab.eurecom.fr/oai/openairinterface5g/wikis/home)
--- a/openair1/PHY/CODING/TESTBENCH/ldpctest.c
+++ b/openair1/PHY/CODING/TESTBENCH/ldpctest.c
@@ -101,8 +101,8 @@ int test_ldpc(short No_iteration,
              unsigned int *crc_misses,
              time_stats_t *time_optim,
              time_stats_t *time_decoder,
-              n_iter_stats_t *dec_iter,
-              short run_cuda)
+              n_iter_stats_t *dec_iter
+              )
 {
  //clock initiate
  //time_stats_t time,time_optim,tinput,tprep,tparity,toutput, time_decoder;
@@ -393,27 +393,12 @@ int test_ldpc(short No_iteration,
      decParams.R=code_rate_vec[R_ind];//13;
      decParams.numMaxIter=No_iteration;
      decParams.outMode = nrLDPC_outMode_BIT;
+      decParams.block_length=block_length;
      //decParams.outMode =nrLDPC_outMode_LLRINT8;
-#ifdef CUDA_FLAG
-	  set_compact_BG(Zc,BG);
-	  init_LLR_DMA_for_CUDA(&decParams, (int8_t*)channel_output_fixed[j], (int8_t*)estimated_output[j], block_length);
-#endif
+	  nrLDPC_initcall(&decParams, (int8_t*)channel_output_fixed[j], (int8_t*)estimated_output[j]);
 	  for(j=0;j<n_segments;j++) {
    	  start_meas(time_decoder);
-#ifdef CUDA_FLAG
-        if(run_cuda){
-          n_iter = nrLDPC_decoder_LYC(&decParams, (int8_t*)channel_output_fixed[j], (int8_t*)estimated_output[j], block_length, time_decoder);
-        }  
-        else{ 
-        // decode the sequence
-        // decoder supports BG2, Z=128 & 256
-        //esimated_output=ldpc_decoder(channel_output_fixed, block_length, No_iteration, (double)((float)nom_rate/(float)denom_rate));
-        ///nrLDPC_decoder(&decParams, channel_output_fixed, estimated_output, NULL);
-          n_iter = nrLDPC_decoder(&decParams, (int8_t*)channel_output_fixed[j], (int8_t*)estimated_output[j], p_nrLDPC_procBuf, p_decoder_profiler);
-        }
-#else
          n_iter = nrLDPC_decoder(&decParams, (int8_t*)channel_output_fixed[j], (int8_t*)estimated_output[j], p_nrLDPC_procBuf, p_decoder_profiler);
-#endif
 	      stop_meas(time_decoder);
      }

@@ -514,17 +499,14 @@ int test_ldpc(short No_iteration,

 int main(int argc, char *argv[])
 {
-#ifdef CUDA_FLAG	
-  warmup_for_GPU();
-#endif
+
  unsigned int errors, errors_bit, crc_misses;
  double errors_bit_uncoded;
  short block_length=8448; // decoder supports length: 1201 -> 1280, 2401 -> 2560
-
+  char *ldpc_version=NULL; /* version of the ldpc decoder library to use (XXX suffix to use when loading libldpc_XXX.so */
  short No_iteration=5;
  int n_segments=1;
  //double rate=0.333;
-  short run_cuda = 0;
  
  int nom_rate=1;
  int denom_rate=3;
@@ -544,7 +526,7 @@ int main(int argc, char *argv[])

  short BG=0,Zc,Kb=0;

-  while ((c = getopt (argc, argv, "q:r:s:S:l:G:n:d:i:t:u:h")) != -1)
+  while ((c = getopt (argc, argv, "q:r:s:S:l:G:n:d:i:t:u:hv:")) != -1)
    switch (c)
    {
      case 'q':
@@ -564,7 +546,7 @@ int main(int argc, char *argv[])
        break;
 		
      case 'G':
-        run_cuda = atoi(optarg);
+        ldpc_version="_cuda";
        break;

      case 'n':
@@ -590,7 +572,9 @@ int main(int argc, char *argv[])
      case 'u':
        test_uncoded = atoi(optarg);
        break;
-
+      case 'v':
+          ldpc_version=strdup(optarg);
+        break;
      case 'h':
      default:
              printf("CURRENTLY SUPPORTED CODE RATES: \n");
@@ -609,6 +593,7 @@ int main(int argc, char *argv[])
              printf("-t SNR simulation step, Default: 0.1\n");
              printf("-i Max decoder iterations, Default: 5\n");
              printf("-u Set SNR per coded bit, Default: 0\n");
+              printf("-v XXX Set ldpc shared library version. libldpc_XXX.so will be used \n");
              exit(1);
              break;
    }
@@ -619,7 +604,10 @@ int main(int argc, char *argv[])
  printf("SNR0 %f: \n", SNR0);


-  load_nrLDPClib();
+  if (ldpc_version != NULL)
+    load_nrLDPClib(ldpc_version);
+  else
+    load_nrLDPClib(NULL); 
  load_nrLDPClib_ref("_orig", &encoder_orig);
  //for (block_length=8;block_length<=MAX_BLOCK_LENGTH;block_length+=8)

@@ -691,8 +679,7 @@ int main(int argc, char *argv[])
                                &crc_misses,
                                time_optim,
                                time_decoder,
-                                dec_iter,
-								run_cuda);
+                                dec_iter);

    printf("SNR %f, BLER %f (%u/%d)\n", SNR, (float)decoded_errors[i]/(float)n_trials, decoded_errors[i], n_trials);
    printf("SNR %f, BER %f (%u/%d)\n", SNR, (float)errors_bit/(float)n_trials/(float)block_length/(double)n_segments, decoded_errors[i], n_trials);

--- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c
+++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.c
@@ -46,7 +46,8 @@
 #endif

 static inline uint32_t nrLDPC_decoder_core(int8_t* p_llr, int8_t* p_out, t_nrLDPC_procBuf* p_procBuf, uint32_t numLLR, t_nrLDPC_lut* p_lut, t_nrLDPC_dec_params* p_decParams, t_nrLDPC_time_stats* p_profiler);
-
+void nrLDPC_initcall(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out) {
+}
 int32_t nrLDPC_decod(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, t_nrLDPC_procBuf* p_procBuf, t_nrLDPC_time_stats* p_profiler)
 {
    uint32_t numLLR;

--- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_CL.c
+++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_CL.c
+/*
+ * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The OpenAirInterface Software Alliance licenses this file to You under
+ * the OAI Public License, Version 1.0  (the "License"); you may not use this file
+ * except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.openairinterface.org/?page_id=698
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *-------------------------------------------------------------------------------
+ * For more information about the OpenAirInterface (OAI) Software Alliance:
+ *      contact@openairinterface.org
+ */
+ /*! \file nrLDPC_decoder_CL.c
+* \brief ldpc decoder, openCL implementaion
+* \author Francois TABURET
+* \date 2021
+* \version 1.0
+* \note initial implem - translation of cuda version
+*/
+/* uses HW  component id for log messages ( --log_config.hw_log_level <warning| info|debug|trace>) */
+#include <stdio.h>
+#include <unistd.h>
+#include <cuda_runtime.h>
+#include <CL/opencl.h>
+#include "PHY/CODING/nrLDPC_decoder/nrLDPC_types.h"
+#include "PHY/CODING/nrLDPC_decoder/nrLDPCdecoder_defs.h"
+#include "assertions.h"
+#include "common/utils/LOG/log.h"
+
+#define MAX_ITERATION 2
+#define MC	1
+
+#define MAX_OCLDEV   10
+#define MAX_OCLRUNTIME 5
+
+
+#define CLSETKERNELARG(A,B,C,D) \
+rt=clSetKernelArg(A,B,C,D) ;\
+AssertFatal(rt == CL_SUCCESS, "Error %d setting kernel argument index %d\n" , (int)rt, B);
+
+typedef struct{
+  char x;
+  char y;
+  short value;
+} h_element;
+#include "../nrLDPC_decoder_LYC/bgs/BG1_compact_in_C.h"
+
+typedef struct{
+  cl_uint max_CU;
+  cl_uint max_WID;
+  size_t  *max_WIS;
+} ocldev_t;
+
+typedef struct {
+  cl_kernel cnp_kernel_1st;
+  cl_kernel cnp_kernel;
+  cl_kernel vnp_kernel_normal;
+  cl_kernel pack_decoded_bit;
+} oclkernels_t;
+
+typedef struct{
+  cl_uint num_devices;
+  cl_device_id devices[MAX_OCLDEV];
+  ocldev_t ocldev[MAX_OCLDEV];
+  cl_context context;
+  cl_program program;
+  cl_mem dev_h_compact1;
+  cl_mem dev_h_compact2;
+  cl_mem dev_const_llr;
+  cl_mem dev_llr;
+  cl_mem dev_dt;
+  cl_mem dev_tmp;  
+  oclkernels_t kernels[MAX_OCLDEV];
+  cl_command_queue queue[MAX_OCLDEV];
+} oclruntime_t;
+
+typedef struct{
+  oclruntime_t runtime[MAX_OCLRUNTIME];
+} ocl_t;
+
+
+
+ocl_t ocl;
+
+
+void set_compact_BG(int Zc,short BG){
+	cl_uint rt;
+	int row,col;
+	if(BG == 1){
+		row = 46;
+		col = 68;
+	}
+	else{
+		row = 42;
+		col = 52;
+	}
+	int compact_row = 30; 
+	int compact_col = 19;
+	if(BG==2){compact_row = 10, compact_col = 23;}
+	int memorySize_h_compact1 = row * compact_col * sizeof(h_element);
+	int memorySize_h_compact2 = compact_row * col * sizeof(h_element);
+	int lift_index = 0;
+	short lift_set[][9] = {
+		{2,4,8,16,32,64,128,256},
+		{3,6,12,24,48,96,192,384},
+		{5,10,20,40,80,160,320},
+		{7,14,28,56,112,224},
+		{9,18,36,72,144,288},
+		{11,22,44,88,176,352},
+		{13,26,52,104,208},
+		{15,30,60,120,240},
+		{0}
+	};
+	
+	for(int i = 0; lift_set[i][0] != 0; i++){
+		for(int j = 0; lift_set[i][j] != 0; j++){
+			if(Zc == lift_set[i][j]){
+				lift_index = i;
+				break;
+			}
+		}
+	}
+	printf("\nZc = %d BG = %d\n",Zc,BG);
+    ocl.runtime[0].dev_h_compact1 = clCreateBuffer(ocl.runtime[0].context, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY, memorySize_h_compact1, NULL, (cl_int *)&rt);
+    AssertFatal(rt == CL_SUCCESS, "Error %d creating buffer dev_h_compact1 for platform %i \n" , (int)rt, 0);	
+    ocl.runtime[0].dev_h_compact2 = clCreateBuffer(ocl.runtime[0].context, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY, memorySize_h_compact2, NULL, (cl_int *)&rt);
+    AssertFatal(rt == CL_SUCCESS, "Error %d creating buffer dev_h_compact2 for platform %i \n" , (int)rt, 0);  
+    h_element *h1;
+    h_element *h2; 
+	switch(lift_index){
+			case 0:
+				h1 = host_h_compact1_I0;
+				h2 = host_h_compact2_I0;
+				break;
+			case 1:
+				h1 = host_h_compact1_I1;
+				h2 = host_h_compact2_I1;
+				break;
+			case 2:
+				h1 = host_h_compact1_I2;
+				h2 = host_h_compact2_I2;
+				break;
+			case 3:
+				h1 = host_h_compact1_I3;
+				h2 = host_h_compact2_I3;
+				break;
+			case 4:
+				h1 = host_h_compact1_I4;
+				h2 = host_h_compact2_I4;
+				break;
+			case 5:
+				h1 = host_h_compact1_I5;
+				h2 = host_h_compact2_I5;
+				break;
+			case 6:
+				h1 = host_h_compact1_I6;
+				h2 = host_h_compact2_I6;
+				break;
+			case 7:
+				h1 = host_h_compact1_I7;
+				h2 = host_h_compact2_I7;
+				break;
+			default:
+				AssertFatal(0, "Invalid lift_index value %i\n" , lift_index); 
+				break;
+		}
+     rt = clEnqueueWriteBuffer(ocl.runtime[0].queue[0], ocl.runtime[0].dev_h_compact1, CL_TRUE, 0,memorySize_h_compact1, h1, 0, NULL, NULL);
+	 AssertFatal(rt == CL_SUCCESS, "Error %d moving  h_compact1 memory to pltf %i dev %i\n" , (int)rt, 0,0); 
+     rt = clEnqueueWriteBuffer(ocl.runtime[0].queue[0], ocl.runtime[0].dev_h_compact2, CL_TRUE, 0,memorySize_h_compact2, h2, 0, NULL, NULL);
+	 AssertFatal(rt == CL_SUCCESS, "Error %d moving  h_compact2 memory to pltf %i dev %i\n" , (int)rt, 0,0); 	 	
+	// return 0;
+}
+void cl_error_callback(const char* errinfo, const void* private_info, size_t cb, void* user_data) {
+  oclruntime_t *runtime = (oclruntime_t *)user_data;
+  LOG_E(HW,"OpenCL accelerator error  %s\n", errinfo );
+}
+
+char *clutil_getstrdev(int intdev) {
+  static char retstring[255]="";
+  char *retptr=retstring;
+  retptr+=sprintf(retptr,"0x%08x: ",(uint32_t)intdev);
+  if (intdev & CL_DEVICE_TYPE_CPU)
+	  retptr+=sprintf(retptr,"%s","cpu ");
+  if (intdev & CL_DEVICE_TYPE_GPU)
+	  retptr+=sprintf(retptr,"%s","gpu ");
+  if (intdev & CL_DEVICE_TYPE_ACCELERATOR)
+	  retptr+=sprintf(retptr,"%s","acc ");  
+  return retstring;
+}
+
+void get_CompilErr(cl_program program, int pltf) {
+
+    // Determine the size of the log
+    size_t log_size;
+    for(int i=0; i<ocl.runtime[pltf].num_devices;i++) {
+      clGetProgramBuildInfo(program, ocl.runtime[pltf].devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+    // Allocate memory for the log
+      char *log = (char *) malloc(log_size);
+    // Get the log
+      clGetProgramBuildInfo(program, ocl.runtime[pltf].devices[i], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
+    // Print the log
+      printf("%s\n", log);
+      free(log);
+    }
+
+}
+
+size_t load_source(char **source_str) {
+	int MAX_SOURCE_SIZE=(500*132);
+    FILE *fp;
+    size_t source_size;
+ 
+    fp = fopen("nrLDPC_decoder_kernels_CL.cl", "r");
+    AssertFatal(fp,"failed to open cl source: %s\n",strerror(errno));
+    
+    *source_str = (char*)malloc(MAX_SOURCE_SIZE);
+    source_size = fread( *source_str, 1, MAX_SOURCE_SIZE, fp);
+    fclose( fp );
+    return source_size;
+}
+
+/* from here: entry points in decoder shared lib */
+int ldpc_autoinit(void) {   // called by the library loader 
+  cl_platform_id platforms[10];
+  cl_uint         num_platforms_found;
+  int             context_ok=0;
+  cl_uint rt = clGetPlatformIDs( sizeof(platforms)/sizeof(cl_platform_id), platforms, &num_platforms_found );
+  AssertFatal(rt == CL_SUCCESS, "clGetPlatformIDs error %d\n" , (int)rt);
+  AssertFatal( num_platforms_found>0 , "clGetPlatformIDs: no cl compatible platform found\n");
+  for (int i=0 ; i<(int)num_platforms_found ; i++) {
+	  char stringval[255];
+	  rt = clGetPlatformInfo(platforms[i],CL_PLATFORM_PROFILE, sizeof(stringval),stringval,NULL);	
+	  AssertFatal(rt == CL_SUCCESS, "clGetPlatformInfo PROFILE error %d\n" , (int)rt);  
+	  LOG_I(HW,"Platform %i, OpenCL profile %s\n", i,stringval );
+	  rt = clGetPlatformInfo(platforms[i],CL_PLATFORM_VERSION, sizeof(stringval),stringval,NULL);	
+	  AssertFatal(rt == CL_SUCCESS, "clGetPlatformInfo VERSION error %d\n" , (int)rt);  
+	  LOG_I(HW,"Platform %i, OpenCL version %s\n", i,stringval );	  
+	  rt = clGetDeviceIDs(platforms[i],CL_DEVICE_TYPE_ALL, sizeof(ocl.runtime[i].devices)/sizeof(cl_device_id),ocl.runtime[i].devices,&(ocl.runtime[i].num_devices));
+	  AssertFatal(rt == CL_SUCCESS, "clGetDeviceIDs error %d\n" , (int)rt);
+	  int devok=0;
+	  for (int j=0; j<ocl.runtime[i].num_devices; j++) {
+		cl_bool abool;
+		rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_AVAILABLE, sizeof(abool),&abool,NULL);
+		AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo DEVICE_AVAILABLE error %d\n" , (int)rt); 
+		LOG_I(HW,"Device %i is %s available\n", j, (abool==CL_TRUE?"":"not"));
+		cl_device_type devtype;		
+		rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_TYPE, sizeof(cl_device_type),&devtype,NULL);
+		AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo DEVICE_TYPE error %d\n" , (int)rt); 
+		LOG_I(HW,"Device %i, type %d = %s\n", j,(int)devtype, clutil_getstrdev(devtype));
+		rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(ocl.runtime[i].ocldev[j].max_CU),&(ocl.runtime[i].ocldev[j].max_CU),NULL);
+		AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo MAX_COMPUTE_UNITS error %d\n" , (int)rt);
+		LOG_I(HW,"Device %i, number of Compute Units: %d\n", j,ocl.runtime[i].ocldev[j].max_CU);
+		rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,sizeof(ocl.runtime[i].ocldev[j].max_WID),&(ocl.runtime[i].ocldev[j].max_WID),NULL);
+		AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo MAX_WORK_ITEM_DIMENSIONS error %d\n" , (int)rt);
+		LOG_I(HW,"Device %i, max Work Items dimension: %d\n", j,ocl.runtime[i].ocldev[j].max_WID);	
+		ocl.runtime[i].ocldev[j].max_WIS = (size_t *)malloc(ocl.runtime[i].ocldev[j].max_WID * sizeof(size_t));
+		rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(ocl.runtime[i].ocldev[j].max_WID)*sizeof(size_t),ocl.runtime[i].ocldev[j].max_WIS,NULL);
+		AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo MAX_WORK_ITEM_SIZES error %d\n" , (int)rt);
+		for(int k=0; k<ocl.runtime[i].ocldev[j].max_WID;k++)
+		  LOG_I(HW,"Device %i, max Work Items size for dimension: %d %u\n", j,k,(uint32_t)ocl.runtime[i].ocldev[j].max_WIS[k]); 
+		devok++;
+      }
+      if (devok >0) {
+        ocl.runtime[i].context = clCreateContext(NULL, ocl.runtime[i].num_devices, ocl.runtime[i].devices, cl_error_callback, &(ocl.runtime[i]), (cl_int *)&rt); 
+        AssertFatal(rt == CL_SUCCESS, "Error %d creating context for platform %i\n" , (int)rt, i);
+        for(int dev=0; dev<ocl.runtime[i].num_devices; dev++) {
+          ocl.runtime[i].queue[dev] = clCreateCommandQueueWithProperties(ocl.runtime[i].context,ocl.runtime[i].devices[dev] , 0, (cl_int *)&rt);
+          AssertFatal(rt == CL_SUCCESS, "Error %d creating command queue for platform %i device %i\n" , (int)rt, i,dev);
+        }
+        ocl.runtime[i].dev_const_llr = clCreateBuffer(ocl.runtime[i].context, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY, 68*384, NULL, (cl_int *)&rt);
+        AssertFatal(rt == CL_SUCCESS, "Error %d creating buffer dev_const_llr for platform %i \n" , (int)rt, i);
+        ocl.runtime[i].dev_llr = clCreateBuffer(ocl.runtime[i].context, CL_MEM_READ_WRITE|CL_MEM_HOST_WRITE_ONLY, 68*384, NULL, (cl_int *)&rt);
+        AssertFatal(rt == CL_SUCCESS, "Error %d creating buffer dev_llr for platform %i \n" , (int)rt, i); 
+        ocl.runtime[i].dev_dt = clCreateBuffer(ocl.runtime[i].context, CL_MEM_READ_WRITE|CL_MEM_HOST_NO_ACCESS, 46*68*384, NULL, (cl_int *)&rt);
+        AssertFatal(rt == CL_SUCCESS, "Error %d creating buffer dev_dt for platform %i \n" , (int)rt, i);                
+        ocl.runtime[i].dev_tmp = clCreateBuffer(ocl.runtime[i].context, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY, 68*384, NULL, (cl_int *)&rt);
+        AssertFatal(rt == CL_SUCCESS, "Error %d creating buffer dev_tmp for platform %i \n" , (int)rt, i);      
+        char *source_str;
+        size_t source_size=load_source(&source_str);      
+        cl_program program = clCreateProgramWithSource(ocl.runtime[i].context, 1, 
+                                                       (const char **)&source_str, (const size_t *)&source_size,  (cl_int *)&rt);
+        AssertFatal(rt == CL_SUCCESS, "Error %d creating program for platform %i \n" , (int)rt, i); 
+        rt = clBuildProgram(program, ocl.runtime[i].num_devices,ocl.runtime[i].devices, NULL, NULL, NULL);  
+        if (rt == CL_BUILD_PROGRAM_FAILURE) {
+          get_CompilErr(program,i);
+	    } 
+        AssertFatal(rt == CL_SUCCESS, "Error %d buildding program for platform %i \n" , rt, i); 
+        
+	    for(int dev=0; dev<ocl.runtime[i].num_devices; dev++) {
+		  ocl.runtime[i].kernels[dev].cnp_kernel_1st = clCreateKernel(program, "ldpc_cnp_kernel_1st_iter", (cl_int *)&rt);
+		  AssertFatal(rt == CL_SUCCESS, "Error %d creating kernel %s platform %i, dev %i\n" , (int)rt,"ldpc_cnp_kernel_1st_iter", i,dev); 
+		  ocl.runtime[i].kernels[dev].cnp_kernel = clCreateKernel(program, "ldpc_cnp_kernel", (cl_int *)&rt);
+		  AssertFatal(rt == CL_SUCCESS, "Error %d creating kernel %s platform %i, dev %i\n" , (int)rt,"ldpc_cnp_kernel", i,dev); 
+		  ocl.runtime[i].kernels[dev].vnp_kernel_normal = clCreateKernel(program, "ldpc_vnp_kernel_normal", (cl_int *)&rt);
+		  AssertFatal(rt == CL_SUCCESS, "Error %d creating kernel %s platform %i, dev %i\n" , (int)rt,"ldpc_vnp_kernel_normal", i,dev); 
+		  ocl.runtime[i].kernels[dev].pack_decoded_bit = clCreateKernel(program, "pack_decoded_bit", (cl_int *)&rt);
+		  AssertFatal(rt == CL_SUCCESS, "Error %d creating kernel %s platform %i, dev %i\n" , (int)rt,"pack_decoded_bit", i,dev); 		  		  		  
+        }
+        context_ok++;
+      }
+      devok=0;
+  }
+  AssertFatal(context_ok>0, "No openCL device available to accelerate ldpc\n"); 
+  return 0;  
+}
+
+
+void nrLDPC_initcall(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out) {
+	set_compact_BG(p_decParams->Z,p_decParams->BG);
+//	init_LLR_DMA(p_decParams, p_llr,  p_out);
+}
+
+
+int32_t nrLDPC_decod(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out,t_nrLDPC_procBuf* p_procBuf, t_nrLDPC_time_stats *time_decoder)
+{
+    uint16_t Zc          = p_decParams->Z;
+    uint8_t  BG         = p_decParams->BG;
+//    uint8_t  numMaxIter = p_decParams->numMaxIter;
+    int block_length    = p_decParams->block_length;
+//    e_nrLDPC_outMode outMode = p_decParams->outMode;
+	uint8_t row,col;
+	if(BG == 1){
+		row = 46;
+		col = 68;
+	}
+	else{
+		row = 42;
+		col = 52;
+	}
+
+//	alloc memory
+//	unsigned char *hard_decision = (unsigned char*)p_out;
+//	gpu
+	int memorySize_llr = col * Zc * sizeof(char) * MC;
+//	cudaCheck( cudaMemcpyToSymbol(dev_const_llr, p_llr, memorySize_llr_cuda) );
+//	cudaCheck( cudaMemcpyToSymbol(dev_llr, p_llr, memorySize_llr_cuda) );
+    int rt = clEnqueueWriteBuffer(ocl.runtime[0].queue[0], ocl.runtime[0].dev_const_llr, CL_TRUE, 0,
+                               memorySize_llr, p_llr, 0, NULL, NULL);
+	AssertFatal(rt == CL_SUCCESS, "Error %d moving p_llr data to  read only memory in pltf %i dev %i\n" , (int)rt, 0,0); 
+    rt = clEnqueueWriteBuffer(ocl.runtime[0].queue[0], ocl.runtime[0].dev_llr, CL_TRUE, 0,
+                               memorySize_llr, p_llr, 0, NULL, NULL);
+	AssertFatal(rt == CL_SUCCESS, "Error %d moving p_llr data to  read-write memory in pltf %i dev %i\n" , (int)rt, 0,0); 	
+// Define CUDA kernel dimension
+//	int blockSizeX = Zc;
+//	dim3 dimGridKernel1(row, MC, 1); 	// dim of the thread blocks
+//	dim3 dimBlockKernel1(blockSizeX, 1, 1);
+
+//    dim3 dimGridKernel2(col, MC, 1);
+//    dim3 dimBlockKernel2(blockSizeX, 1, 1);	
+//	cudaDeviceSynchronize();
+
+// lauch kernel 
+    size_t global_item_sizek0[2] = {row*Zc,MC}; // Process the entire lists
+    size_t global_item_sizek1[2] = {col*Zc,MC}; // Process the entire lists
+    size_t local_item_sizek[2] = {128,1}; // Divide work items into groups of 128
+    
+	for(int ii = 0; ii < MAX_ITERATION; ii++){
+		// first kernel	
+		if(ii == 0){
+//			ldpc_cnp_kernel_1st_iter 
+//			<<<dimGridKernel1, dimBlockKernel1>>>
+//			( BG, row, col, Zc);
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel_1st, 0, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_llr));
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel_1st, 1, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_dt));
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel_1st, 2, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_h_compact1));
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel_1st, 3, sizeof(int), (void *)&(BG));
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel_1st, 4, sizeof(int), (void *)&(row));	 	      
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel_1st, 5, sizeof(int), (void *)&(col));
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel_1st, 6, sizeof(int), (void *)&(Zc));	      
+          rt = clEnqueueNDRangeKernel(ocl.runtime[0].queue[0], ocl.runtime[0].kernels[0].cnp_kernel_1st, 2, NULL, 
+                                      global_item_sizek0, local_item_sizek, 0, NULL, NULL);
+	      AssertFatal(rt == CL_SUCCESS, "Error %d enqueing cnp_kernel_1st \n" , (int)rt);  
+		}else{
+//		 	ldpc_cnp_kernel
+//			<<<dimGridKernel1, dimBlockKernel1>>>
+//			( BG, row, col, Zc);
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel, 0, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_llr));
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel, 1, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_dt));
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel, 2, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_h_compact1));
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel, 3, sizeof(int), (void *)&(BG));
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel, 4, sizeof(int), (void *)&(row));	 	      
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel, 5, sizeof(int), (void *)&(col));
+	      CLSETKERNELARG(ocl.runtime[0].kernels[0].cnp_kernel, 6, sizeof(int), (void *)&(Zc));	  
+          rt = clEnqueueNDRangeKernel(ocl.runtime[0].queue[0], ocl.runtime[0].kernels[0].cnp_kernel, 2, NULL, 
+                                      global_item_sizek0, local_item_sizek, 0, NULL, NULL);
+	      AssertFatal(rt == CL_SUCCESS, "Error %d enqueing  cnp_kernel\n" , (int)rt);  
+		}
+		// second kernel
+//		ldpc_vnp_kernel_normal
+//		<<<dimGridKernel2, dimBlockKernel2>>>
+//		// (dev_llr, dev_const_llr,BG, row, col, Zc);
+//		(BG, row, col, Zc);
+	  CLSETKERNELARG(ocl.runtime[0].kernels[0].vnp_kernel_normal, 0, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_llr));
+	  CLSETKERNELARG(ocl.runtime[0].kernels[0].vnp_kernel_normal, 1, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_dt));
+	  CLSETKERNELARG(ocl.runtime[0].kernels[0].vnp_kernel_normal, 2, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_const_llr));
+	  CLSETKERNELARG(ocl.runtime[0].kernels[0].vnp_kernel_normal, 3, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_h_compact2));
+	  CLSETKERNELARG(ocl.runtime[0].kernels[0].vnp_kernel_normal, 4, sizeof(int), (void *)&(BG));
+	  CLSETKERNELARG(ocl.runtime[0].kernels[0].vnp_kernel_normal, 5, sizeof(int), (void *)&(row));	 	      
+	  CLSETKERNELARG(ocl.runtime[0].kernels[0].vnp_kernel_normal, 6, sizeof(int), (void *)&(col));
+	  CLSETKERNELARG(ocl.runtime[0].kernels[0].vnp_kernel_normal, 7, sizeof(int), (void *)&(Zc));	 
+      rt = clEnqueueNDRangeKernel(ocl.runtime[0].queue[0], ocl.runtime[0].kernels[0].vnp_kernel_normal, 2, NULL, 
+                                      global_item_sizek1, local_item_sizek, 0, NULL, NULL);
+	  AssertFatal(rt == CL_SUCCESS, "Error %d enqueing vnp_kernel_normal \n" , (int)rt);  
+	}
+	
+//   int pack = (block_length/128)+1;
+//	dim3 pack_block(pack, MC, 1);
+//	pack_decoded_bit<<<pack_block,128>>>( col, Zc);
+	CLSETKERNELARG(ocl.runtime[0].kernels[0].pack_decoded_bit, 0, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_llr));
+	CLSETKERNELARG(ocl.runtime[0].kernels[0].pack_decoded_bit, 1, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_tmp));
+	CLSETKERNELARG(ocl.runtime[0].kernels[0].pack_decoded_bit, 2, sizeof(int), (void *)&(col));
+	CLSETKERNELARG(ocl.runtime[0].kernels[0].pack_decoded_bit, 3, sizeof(int), (void *)&(Zc));
+ 
+    // Execute the OpenCL kernel on the list
+    size_t global_item_size[2] = {block_length,MC}; // Process the entire lists
+    size_t local_item_size[2] = {128,1}; // Divide work items into groups of 128
+    rt = clEnqueueNDRangeKernel(ocl.runtime[0].queue[0], ocl.runtime[0].kernels[0].pack_decoded_bit, 2, NULL, 
+            global_item_size, local_item_size, 0, NULL, NULL);
+	AssertFatal(rt == CL_SUCCESS, "Error %d enqueing pack_decoded_bit \n" , (int)rt);             
+//	cudaCheck( cudaMemcpyFromSymbol((void*)hard_decision, (const void*)dev_tmp, (block_length/8)*sizeof(unsigned char)) );
+//	cudaDeviceSynchronize();
+	
+    rt = clEnqueueReadBuffer(ocl.runtime[0].queue[0], ocl.runtime[0].dev_tmp, CL_TRUE, 0,
+                              (block_length/8)*sizeof(unsigned char) , p_llr, 0, NULL, NULL);
+	AssertFatal(rt == CL_SUCCESS, "Error %d moving p_llr data to  pltf %i dev %i\n" , (int)rt, 0,0); 
+	return MAX_ITERATION;
+	
+}
--- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_kernels_CL.cl
+++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_kernels_CL.cl
+/*
+ * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The OpenAirInterface Software Alliance licenses this file to You under
+ * the OAI Public License, Version 1.0  (the "License"); you may not use this file
+ * except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.openairinterface.org/?page_id=698
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *-------------------------------------------------------------------------------
+ * For more information about the OpenAirInterface (OAI) Software Alliance:
+ *      contact@openairinterface.org
+ */
+
+/*! \file PHY/CODING/nrLDPC_decoder_kernels_CL.cl
+* \brief kernel functions for ldpc decoder accelerated via openCL
+* \author Francois TABURET
+* \date 2021
+* \version 1.0
+* \company Nokia BellLabs France
+* \email: francois.taburet@nokia-bell-labs.com
+* \note initial implem - translation of cuda version
+* \warning
+*/
+#define define MAX_ITERATION 2
+#define MC	1
+
+#define INT32_MAX 2147483647
+
+typedef struct{
+  char x;
+  char y;
+  short value;
+} h_element;
+
+//__global char dev_dt [46*68*384];
+//__local char *dev_t;
+//__global char dev_llr[68*384];
+//__global unsigned char dev_tmp[68*384];
+
+
+
+
+//__constant h_element dev_h_compact1[46*19] = {};  // used in kernel 1
+//__constant h_element dev_h_compact2[68*30] = {};  // used in kernel 2
+
+// __device__ __constantant__ h_element dev_h_compact1[46*19];  // used in kernel 1
+// __device__ __constantant__ h_element dev_h_compact2[68*30];  // used in kernel 2
+
+// row and col element count
+__constant char h_ele_row_bg1_count[46] = {
+	19, 19, 19, 19, 3,  8,  9,  7, 10,  9,
+	7,  8,  7,  6,  7,  7,  6,  6,  6,  6,  
+	6,  6,  5,  5,  6,  5,  5,  4,  5,  5,  
+	5,  5,  5,  5,  5,  5,  5,  4,  5,  5,  
+	4,  5,  4,  5,  5,  4};
+__constant char h_ele_col_bg1_count[68] = {
+	30, 28,  7, 11,  9,  4,  8, 12,  8,  7, 
+	12, 10, 12, 11, 10,  7, 10, 10, 13,  7,  
+	8,  11, 12,  5,  6,  6,  1,  1,  1,  1,  
+	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  
+	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  
+	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  
+	1,   1,  1,  1,  1,  1,  1,  1};
+__constant char h_ele_row_bg2_count[42] = { 
+	8, 10,  8, 10,  4,  6,  6,  6,  4,  5, 
+	5,  5,  4,  5,  5,  4,  5,  5,  4,  4,
+	4,  4,  3,  4,  4,  3,  5,  3,  4,  3,
+	5,  3,  4,  4,  4,  4,  4,  3,  4,  4, 
+	4,  4};
+__constant char h_ele_col_bg2_count[52] = {
+	22, 23, 10,  5,  5, 14,  7, 13,  6,  8, 
+	9,  16,  9, 12,  1,  1,  1,  1,  1,  1,
+	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,   1,  1,  1,  1,  1,  1,  1,  1,  1, 
+	1,   1};
+
+
+
+// Kernel 1
+__kernel void ldpc_cnp_kernel_1st_iter( __global char * dev_llr, __global char * dev_dt,  __local h_element *dev_h_compact1, int BG, int row, int col, int Zc)
+{
+//	int iMCW = blockIdx.y;		// codeword id
+//	int iBlkRow = blockIdx.x;	// block row in h_base
+//	int iSubRow = threadIdx.x;	// row index in sub_block of h_base
+//	if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp %d\n", threadIdx.x);
+	int iMCW = get_group_id(1);		// codeword id
+	int iBlkRow = get_group_id(0);	// block row in h_base
+	int iBlkCol;				// block col in h_base
+	int iSubRow = get_local_id(0);;	// row index in sub_block of h_base
+	int iCol;					// overall col index in h_base
+	int offsetR;
+	int shift_t;
+
+//	For 2-min algorithm.
+	int Q_sign = 0;
+	int sq;
+	int Q, Q_abs;
+	int R_temp;
+
+	int sign = 1;
+	int rmin1 = INT32_MAX;
+	int rmin2 = INT32_MAX;
+	char idx_min = 0;
+
+	h_element h_element_t;
+	int s = (BG==1)? h_ele_row_bg1_count[iBlkRow]:h_ele_row_bg2_count[iBlkRow];
+	offsetR = (iMCW * row*col*Zc) + iBlkRow * Zc + iSubRow;	// row*col*Zc = size of dev_dt
+//	if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
+//	The 1st recursion
+	for(int i = 0; i < s; i++) // loop through all the ZxZ sub-blocks in a row
+	{
+		h_element_t = dev_h_compact1[i*row+iBlkRow];	// compact_col == row
+
+		iBlkCol = h_element_t.y;
+		shift_t = h_element_t.value;
+
+		shift_t = (iSubRow + shift_t) % Zc;
+		iCol = (iMCW * col*Zc) + iBlkCol * Zc + shift_t;	// col*Zc = size of llr
+		Q = dev_llr[iCol];
+		Q_abs = (Q>0)? Q : -Q;
+		sq = Q < 0;
+//		if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
+		// quick version
+		sign = sign * (1 - sq * 2);
+		Q_sign |= sq << i;
+
+		if (Q_abs < rmin1){
+			rmin2 = rmin1;
+			rmin1 = Q_abs;
+			idx_min = i;
+		} else if (Q_abs < rmin2){
+			rmin2 = Q_abs;
+		}
+	}
+
+//	if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);
+
+//	The 2nd recursion
+	for(int i = 0; i < s; i++){
+		// v0: Best performance so far. 0.75f is the value of alpha.
+		sq = 1 - 2 * ((Q_sign >> i) & 0x01);
+		R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2);
+		// write results to global memory
+		h_element_t = dev_h_compact1[i*row+iBlkRow];
+		int addr_temp = offsetR + h_element_t.y * row * Zc;
+		dev_dt[addr_temp] = R_temp;
+//		if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
+	}	
+}
+
+// Kernel_1
+__kernel void ldpc_cnp_kernel( __global char * dev_llr, __global char * dev_dt, __local h_element *dev_h_compact1, int BG, int row, int col, int Zc)
+{
+//	if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp\n");
+//	int iMCW = blockIdx.y;
+//	int iBlkRow = blockIdx.x;	// block row in h_base				// block col in h_base
+//	int iSubRow = threadIdx.x;	// row index in sub_block of h_base
+	int iMCW = get_group_id(1);
+	int iBlkRow = get_group_id(0);	// block row in h_base
+	int iBlkCol; 				// block col in h_base
+	int iSubRow = get_local_id(0);;	// row index in sub_block of h_base
+	int iCol; 					// overall col index in h_base
+	int offsetR;
+	int shift_t;
+
+//	For 2-min algorithm.
+	int Q_sign = 0;
+	int sq;
+	int Q, Q_abs;
+	int R_temp;
+
+	int sign = 1;
+	int rmin1 = INT32_MAX;
+	int rmin2 = INT32_MAX;
+	char idx_min = 0;
+
+	h_element h_element_t;
+	int s = (BG==1)? h_ele_row_bg1_count[iBlkRow]: h_ele_row_bg2_count[iBlkRow];
+	offsetR = (iMCW *row*col*Zc) + iBlkRow * Zc + iSubRow;	// row * col * Zc = size of dev_dt
+//	if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
+//	The 1st recursion
+	for(int i = 0; i < s; i++) // loop through all the ZxZ sub-blocks in a row
+	{
+		h_element_t = dev_h_compact1[i*row+iBlkRow];
+
+		iBlkCol = h_element_t.y;
+		shift_t = h_element_t.value;
+		shift_t = (iSubRow + shift_t) % Zc;
+		iCol = iBlkCol * Zc + shift_t;
+		
+		R_temp = dev_dt[offsetR + iBlkCol * row * Zc];
+
+		Q = dev_llr[iMCW * (col*Zc) + iCol] - R_temp;
+		Q_abs = (Q>0)? Q : -Q;
+//		if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
+		sq = Q < 0;
+		sign = sign * (1 - sq * 2);
+		Q_sign |= sq << i;
+
+		if (Q_abs < rmin1){
+			rmin2 = rmin1;
+			rmin1 = Q_abs;
+			idx_min = i;
+		} else if (Q_abs < rmin2){
+			rmin2 = Q_abs;
+		}
+		
+	}
+
+//	if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);
+	
+//	The 2nd recursion
+	for(int i = 0; i < s; i ++){
+		sq = 1 - 2 * ((Q_sign >> i) & 0x01);
+		R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2);
+		
+
+		// write results to global memory
+		h_element_t = dev_h_compact1[i*row+iBlkRow];
+		int addr_temp = h_element_t.y * row * Zc + offsetR;
+		dev_dt[addr_temp] = R_temp;
+//		if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
+	}
+}
+
+// Kernel 2: VNP processing
+__kernel void
+ldpc_vnp_kernel_normal(__global char * dev_llr, __global char * dev_dt,  __global char * dev_const_llr, __local h_element *dev_h_compact2, int BG, int row, int col, int Zc)
+{
+//	int iMCW = blockIdx.y;
+//	int iBlkCol = blockIdx.x;
+//	int iSubCol = threadIdx.x;	
+	int iMCW = get_group_id(1);
+	int iBlkCol = get_group_id(0);
+	int iBlkRow;
+	int iSubCol = get_local_id(0);
+	int iRow;
+	int iCol;
+
+	int shift_t, sf;
+	int APP;
+
+	h_element h_element_t;
+
+	// update all the llr values
+	iCol = iBlkCol * Zc + iSubCol;
+	APP = dev_const_llr[iMCW *col*Zc + iCol];
+	int offsetDt = iMCW *row*col*Zc + iBlkCol * row * Zc;
+	int s = (BG==1)? h_ele_col_bg1_count[iBlkCol]:h_ele_col_bg2_count[iBlkCol];
+	
+	for(int i = 0; i < s; i++)
+	{
+		h_element_t = dev_h_compact2[i*col+iBlkCol];
+
+		shift_t = h_element_t.value%Zc;
+		iBlkRow = h_element_t.x;
+
+		sf = iSubCol - shift_t;
+		sf = (sf + Zc) % Zc;
+
+		iRow = iBlkRow * Zc + sf;
+		APP = APP + dev_dt[offsetDt + iRow];
+	}
+	if(APP > SCHAR_MAX)	APP = SCHAR_MAX;
+	if(APP < SCHAR_MIN)	APP = SCHAR_MIN;
+	// write back to device global memory
+	dev_llr[iMCW *col*Zc + iCol] = APP;
+}
+
+
+__kernel void pack_decoded_bit(__global unsigned char * dev_llr, __global unsigned char * dev_tmp, int col, int Zc)
+{
+//	int iMCW = blockIdx.y;
+//	int btid = threadIdx.x;
+	unsigned char tmp[128];
+	int iMCW = get_group_id(1);
+	int btid = get_local_id(0);
+	int tid = iMCW * col*Zc + get_group_id(0)*128 + btid;	
+	tmp[btid] = 0;
+	
+	if(dev_llr[tid] < 0){
+		tmp[btid] = 1 << (7-(btid&7));
+	}
+//	__syncthreads();
+	
+	if(btid < 16){
+		dev_tmp[iMCW * col*Zc + get_group_id(0)*16+btid] = 0;
+		for(int i = 0; i < 8; i++){
+			dev_tmp[iMCW * col*Zc + get_group_id(0)*16+btid] += tmp[btid*8+i];
+		}
+	}
+}
+
--- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_types.h
+++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_types.h
@@ -70,6 +70,7 @@ typedef struct nrLDPC_dec_params {
    uint16_t Z; /**< Lifting size */
    uint8_t R; /**< Decoding rate: Format 15,13,... for code rates 1/5, 1/3,... */
    uint8_t numMaxIter; /**< Maximum number of iterations */
+    int block_length;
    e_nrLDPC_outMode outMode; /**< Output format */
 } t_nrLDPC_dec_params;


--- a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
+++ b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
@@ -7,14 +7,14 @@
 * \note
 * \warning
 */
-
+#include <iostream>
 #include <stdio.h>
 #include <unistd.h>
 #include <cuda_runtime.h>
 #include <cuda.h>
 #include "PHY/CODING/nrLDPC_decoder/nrLDPC_types.h"
 #include "PHY/CODING/nrLDPC_decoder/nrLDPCdecoder_defs.h"
-
+#include "assertions.h"
 #include "bgs/BG1_I0"
 #include "bgs/BG1_I1"
 #include "bgs/BG1_I2"
@@ -462,10 +462,11 @@ void read_BG(int BG, int *h, int row, int col)
 }

 extern "C"
-void init_LLR_DMA_for_CUDA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length){
+void init_LLR_DMA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out){
 	
 	uint16_t Zc          = p_decParams->Z;
    uint8_t  BG         = p_decParams->BG;
+    int block_length     = p_decParams->block_length;
 	uint8_t row,col;
 	if(BG == 1){
 		row = 46;
@@ -483,14 +484,60 @@ void init_LLR_DMA_for_CUDA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8
 	
 }

+using namespace std ;
+
+/* from here: entry points in decoder shared lib */
 extern "C"
-int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length, time_stats_t *time_decoder)
-{
+int ldpc_autoinit(void) {   // called by the library loader 
+/*int devices = 0; 
+
+  cudaError_t err = cudaGetDeviceCount(&devices); 
+  AssertFatal(devices>0,"\nNo cuda GPU found\n\n");

+    const int kb = 1024;
+    const int mb = kb * kb;
+    wcout << "NBody.GPU" << endl << "=========" << endl << endl;

+    wcout << "CUDA version:   v" << CUDART_VERSION << endl;    
+    
+
+    wcout << "CUDA Devices: " << endl << endl;
+
+    for(int i = 0; i < devices; ++i)
+    {
+        cudaDeviceProp props;
+        cudaGetDeviceProperties(&props, i);
+        wcout << i << ": " << props.name << ": " << props.major << "." << props.minor << endl;
+        wcout << "  Global memory:   " << props.totalGlobalMem / mb << "mb" << endl;
+        wcout << "  Shared memory:   " << props.sharedMemPerBlock / kb << "kb" << endl;
+        wcout << "  Constant memory: " << props.totalConstMem / kb << "kb" << endl;
+        wcout << "  Block registers: " << props.regsPerBlock << endl << endl;
+
+        wcout << "  Warp size:         " << props.warpSize << endl;
+        wcout << "  Threads per block: " << props.maxThreadsPerBlock << endl;
+        wcout << "  Max block dimensions: [ " << props.maxThreadsDim[0] << ", " << props.maxThreadsDim[1]  << ", " << props.maxThreadsDim[2] << " ]" << endl;
+        wcout << "  Max grid dimensions:  [ " << props.maxGridSize[0] << ", " << props.maxGridSize[1]  << ", " << props.maxGridSize[2] << " ]" << endl;
+        wcout << endl;
+    }
+*/
+  warmup_for_GPU();
+  return 0;  
+}
+
+extern "C"
+void nrLDPC_initcall(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out) {
+	set_compact_BG(p_decParams->Z,p_decParams->BG);
+	init_LLR_DMA(p_decParams, p_llr,  p_out);
+}
+
+
+extern "C"
+int32_t nrLDPC_decod(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out,t_nrLDPC_procBuf* p_procBuf, t_nrLDPC_time_stats *time_decoder)
+{
    uint16_t Zc          = p_decParams->Z;
    uint8_t  BG         = p_decParams->BG;
    uint8_t  numMaxIter = p_decParams->numMaxIter;
+    int block_length    = p_decParams->block_length;
    e_nrLDPC_outMode outMode = p_decParams->outMode;
 	cudaError_t cudaStatus;
 	uint8_t row,col;

--- a/openair1/PHY/CODING/nrLDPC_defs.h
+++ b/openair1/PHY/CODING/nrLDPC_defs.h
@@ -45,6 +45,7 @@ typedef struct {
 	time_stats_t *toutput;
 }encoder_implemparams_t;
 #define INIT0_LDPCIMPLEMPARAMS {0,0,0,NULL,NULL,NULL,NULL}
+typedef void(*nrLDPC_initcallfunc_t)(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out);
 typedef int(*nrLDPC_encoderfunc_t)(unsigned char **,unsigned char **,int,int,short, short, encoder_implemparams_t*);
 //============================================================================================================================
 // decoder interface

--- a/openair1/PHY/CODING/nrLDPC_extern.h
+++ b/openair1/PHY/CODING/nrLDPC_extern.h
@@ -23,11 +23,14 @@
 #ifdef LDPC_LOADER
 nrLDPC_decoderfunc_t nrLDPC_decoder;
 nrLDPC_encoderfunc_t nrLDPC_encoder;
+nrLDPC_initcallfunc_t nrLDPC_initcall;
 #else
 /* functions to load the LDPC shared lib, implemented in openair1/PHY/CODING/nrLDPC_load.c */
-extern int load_nrLDPClib(void) ;
+extern int load_nrLDPClib(char *version) ;
 extern int load_nrLDPClib_ref(char *libversion, nrLDPC_encoderfunc_t * nrLDPC_encoder_ptr); // for ldpctest
 /* ldpc coder/decoder functions, as loaded by load_nrLDPClib(). */
+extern nrLDPC_initcallfunc_t nrLDPC_initcall;
+
 extern nrLDPC_decoderfunc_t nrLDPC_decoder;
 extern nrLDPC_encoderfunc_t nrLDPC_encoder;
 // inline functions:

--- a/openair1/PHY/CODING/nrLDPC_load.c
+++ b/openair1/PHY/CODING/nrLDPC_load.c
@@ -42,22 +42,32 @@


 /* function description array, to be used when loading the encoding/decoding shared lib */
-static loader_shlibfunc_t shlib_fdesc[2];
+static loader_shlibfunc_t shlib_fdesc[3];

-char *arg[64]={"ldpctest","-O","cmdlineonly::dbgl0"};
+/* arguments used when called from phy simulators exec's which do not use the config module */
+/* arg is used to initialize the config module so that the loader works as expected */
+char *arg[64]={"ldpctest","-O","cmdlineonly::dbgl0",NULL,NULL};

-int load_nrLDPClib(void) {
+int load_nrLDPClib(char *version) {
 	 char *ptr = (char*)config_get_if();
+	 char libname[64]="ldpc";
+
     if ( ptr==NULL )  {// phy simulators, config module possibly not loaded
-     	 load_configmodule(3,(char **)arg,CONFIG_ENABLECMDLINEONLY) ;
+     	 load_configmodule(0,(char **)NULL,CONFIG_ENABLECMDLINEONLY) ;
     	 logInit();
     }	 
     shlib_fdesc[0].fname = "nrLDPC_decod";
     shlib_fdesc[1].fname = "nrLDPC_encod";
-     int ret=load_module_shlib("ldpc",shlib_fdesc,sizeof(shlib_fdesc)/sizeof(loader_shlibfunc_t),NULL);
+     shlib_fdesc[2].fname = "nrLDPC_initcall";
+     int ret;
+     if (version)
+       ret=load_module_version_shlib(libname,version,shlib_fdesc,sizeof(shlib_fdesc)/sizeof(loader_shlibfunc_t),NULL);
+     else
+       ret=load_module_shlib(libname,shlib_fdesc,sizeof(shlib_fdesc)/sizeof(loader_shlibfunc_t),NULL);
     AssertFatal( (ret >= 0),"Error loading ldpc decoder");
     nrLDPC_decoder = (nrLDPC_decoderfunc_t)shlib_fdesc[0].fptr;
     nrLDPC_encoder = (nrLDPC_encoderfunc_t)shlib_fdesc[1].fptr;
+     nrLDPC_initcall = (nrLDPC_initcallfunc_t)shlib_fdesc[2].fptr;
 return 0;
 }

@@ -65,10 +75,8 @@ int load_nrLDPClib_ref(char *libversion, nrLDPC_encoderfunc_t * nrLDPC_encoder_p
 	loader_shlibfunc_t shlib_encoder_fdesc;

     shlib_encoder_fdesc.fname = "nrLDPC_encod";
-     char libpath[64];
-     sprintf(libpath,"ldpc%s",libversion);
-     int ret=load_module_shlib(libpath,&shlib_encoder_fdesc,1,NULL);
-     AssertFatal( (ret >= 0),"Error loading ldpc encoder %s\n",libpath);
+     int ret=load_module_version_shlib("ldpc",libversion,&shlib_encoder_fdesc,1,NULL);
+     AssertFatal( (ret >= 0),"Error loading ldpc encoder %s\n",(libversion==NULL)?"":libversion);
     *nrLDPC_encoder_ptr = (nrLDPC_encoderfunc_t)shlib_encoder_fdesc.fptr;
 return 0;
 }

--- a/openair1/PHY/INIT/nr_init.c
+++ b/openair1/PHY/INIT/nr_init.c
@@ -108,7 +108,7 @@ int phy_init_nr_gNB(PHY_VARS_gNB *gNB,
  crcTableInit();
  init_scrambling_luts();
  init_pucch2_luts();
-  load_nrLDPClib();
+  load_nrLDPClib(NULL);
  // PBCH DMRS gold sequences generation
  nr_init_pbch_dmrs(gNB);
  //PDCCH DMRS init

--- a/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_decoding.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_decoding.c
@@ -497,6 +497,8 @@ uint32_t nr_dlsch_decoding(PHY_VARS_NR_UE *phy_vars_ue,
      }

      VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_DLSCH_LDPC, VCD_FUNCTION_IN);
+      p_decParams->block_length=length_dec;
+      nrLDPC_initcall(p_decParams, (int8_t*)&pl[0], llrProcBuf);
      no_iteration_ldpc = nrLDPC_decoder(p_decParams,
                                         (int8_t *)&pl[0],
                                         llrProcBuf,
@@ -956,7 +958,8 @@ uint32_t  nr_dlsch_decoding_mthread(PHY_VARS_NR_UE *phy_vars_ue,
    for (i=0, j=0; j < ((kc*harq_process->Z)>>4)+1;  i+=2, j++) {
      pl[j] = _mm_packs_epi16(pv[i],pv[i+1]);
    }
-
+    p_decParams->block_length=length_dec;
+    nrLDPC_initcall(p_decParams, (int8_t*)&pl[0], llrProcBuf);
    no_iteration_ldpc = nrLDPC_decoder(p_decParams,
                                       (int8_t *)&pl[0],
                                       llrProcBuf,
@@ -1340,7 +1343,8 @@ void nr_dlsch_decoding_process(void *arg) {
    for (i=0, j=0; j < ((kc*harq_process->Z)>>4)+1;  i+=2, j++) {
      pl[j] = _mm_packs_epi16(pv[i],pv[i+1]);
    }
-
+    p_decParams->block_length=length_dec;
+    nrLDPC_initcall(p_decParams, (int8_t*)&pl[0], llrProcBuf);
    no_iteration_ldpc = nrLDPC_decoder(p_decParams,
                                       (int8_t *)&pl[0],
                                       llrProcBuf,