diff --git a/cmake_targets/CMakeLists.txt b/cmake_targets/CMakeLists.txt
index d154910828bc37671aa8d0136c39e45fa217b1ac..cf7b1543d39958984016bf528f816c0c8d197f4f 100644
--- a/cmake_targets/CMakeLists.txt
+++ b/cmake_targets/CMakeLists.txt
@@ -309,6 +309,7 @@ endif()
 
 #
 # add autotools definitions that were maybe used!
+
 add_definitions("-DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_FCNTL_H=1 -DHAVE_ARPA_INET_H=1 -DHAVE_SYS_TIME_H=1 -DHAVE_SYS_SOCKET_H=1 -DHAVE_STRERROR=1 -DHAVE_SOCKET=1 -DHAVE_MEMSET=1 -DHAVE_GETTIMEOFDAY=1 -DHAVE_STDLIB_H=1 -DHAVE_MALLOC=1 -DHAVE_LIBSCTP")
 
 set(commonOpts "-pipe -Wno-packed-bitfield-compat -fPIC -Wall -fno-strict-aliasing -rdynamic")
@@ -318,11 +319,7 @@ set(CMAKE_C_FLAGS
 set(CMAKE_CXX_FLAGS
   "${CMAKE_CXX_FLAGS} ${C_FLAGS_PROCESSOR}  ${commonOpts} -std=c++11")
 
-# cuda compiler bug (limitation) on complex macro definition
-if (CUDA_FOUND)
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DCUDA_FLAG")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCUDA_FLAG")
-endif()
+
 
 if (SANITIZE_ADDRESS)
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-common")
@@ -3123,7 +3120,11 @@ target_link_libraries (nr-uesoftmodem ${LIB_LMS_LIBRARIES})
 target_link_libraries (nr-uesoftmodem ${T_LIB})
 
 add_dependencies( nr-uesoftmodem ldpc_orig ldpc_optim ldpc_optim8seg ldpc )
-
+if (CUDA_FOUND)  # the softmodem executables need the ldpc_cuda target built before them
+  foreach(cuda_user nr-uesoftmodem nr-softmodem ocp-gnb)
+    add_dependencies(${cuda_user} ldpc_cuda)
+  endforeach()
+endif()
 ###################################"
 # Addexecutables for tests
 ####################################
@@ -3183,41 +3184,14 @@ target_link_libraries(smallblocktest
   m pthread ${ATLAS_LIBRARIES} dl
   )
 
-if (CUDA_FOUND)
-###################################################
-# For CUDA library
-###################################################
-
-
-    cuda_add_executable(ldpctest
-      ${OPENAIR1_DIR}/PHY/CODING/TESTBENCH/ldpctest.c
-      ${T_SOURCE}
-      ${SHLIB_LOADER_SOURCES}
-      )
-    target_link_libraries(ldpctest -ldl
-      -Wl,--start-group
-      UTIL SIMU PHY_NR CONFIG_LIB
-      -Wl,--end-group
-      m pthread ${ATLAS_LIBRARIES} dl
-      )
-
-else (CUDA_FOUND)
-    add_executable(ldpctest
-       ${PHY_NR_CODINGIF}
-       ${OPENAIR1_DIR}/PHY/CODING/TESTBENCH/ldpctest.c
-       ${T_SOURCE}
-       ${SHLIB_LOADER_SOURCES}
-       )
-
-endif ()
 
+# ldpctest is built as one plain executable, whether or not CUDA_FOUND is set.
+add_executable(ldpctest
+  ${PHY_NR_CODINGIF}
+  ${OPENAIR1_DIR}/PHY/CODING/TESTBENCH/ldpctest.c
+  ${T_SOURCE} ${SHLIB_LOADER_SOURCES}
+  )
 
-# add_executable(ldpctest
-  # ${PHY_NR_CODINGIF}
-  # ${OPENAIR1_DIR}/PHY/CODING/TESTBENCH/ldpctest.c
-  # ${T_SOURCE}
-  # ${SHLIB_LOADER_SOURCES}
-  # )
 add_dependencies( ldpctest ldpc_orig ldpc_optim ldpc_optim8seg ldpc )
 
 target_link_libraries(ldpctest
diff --git a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
index 931d5003385af8b4144fdbef244ed2176272a90e..42faa1e27a628a276ece456995b2511435149814 100644
--- a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
+++ b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
@@ -483,6 +483,52 @@ void init_LLR_DMA_for_CUDA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8
 	
 }
 
+using namespace std ;
+
+/* from here: entry points in decoder shared lib */
+/* Loader entry point: asserts a CUDA device is present, prints device info, calls warmup_for_GPU(). Returns 0. */
+extern "C"
+int ldpc_autoinit(void) {   // called by the library loader
+  int devices = 0;
+  // Report the CUDA runtime error itself instead of folding it into "no GPU found".
+  cudaError_t err = cudaGetDeviceCount(&devices);
+  AssertFatal(err == cudaSuccess, "\ncudaGetDeviceCount failed: %s\n\n", cudaGetErrorString(err));
+  AssertFatal(devices>0,"\nNo cuda GPU found\n\n");
+
+  const int kb = 1024;
+  const int mb = kb * kb;
+  wcout << "nrLDPC CUDA decoder" << endl << "===================" << endl << endl;  // was a leftover sample banner
+  wcout << "CUDA version:   v" << CUDART_VERSION << endl;
+  wcout << "CUDA Devices: " << endl << endl;
+
+  for(int i = 0; i < devices; ++i)
+  {
+    cudaDeviceProp props;
+    // Skip a device whose properties cannot be read: props would be uninitialised.
+    if (cudaGetDeviceProperties(&props, i) != cudaSuccess) continue;
+    wcout << i << ": " << props.name << ": " << props.major << "." << props.minor << endl;
+    wcout << "  Global memory:   " << props.totalGlobalMem / mb << "mb" << endl;
+    wcout << "  Shared memory:   " << props.sharedMemPerBlock / kb << "kb" << endl;
+    wcout << "  Constant memory: " << props.totalConstMem / kb << "kb" << endl;
+    wcout << "  Block registers: " << props.regsPerBlock << endl << endl;
+
+    wcout << "  Warp size:         " << props.warpSize << endl;
+    wcout << "  Threads per block: " << props.maxThreadsPerBlock << endl;
+    wcout << "  Max block dimensions: [ " << props.maxThreadsDim[0] << ", " << props.maxThreadsDim[1]  << ", " << props.maxThreadsDim[2] << " ]" << endl;
+    wcout << "  Max grid dimensions:  [ " << props.maxGridSize[0] << ", " << props.maxGridSize[1]  << ", " << props.maxGridSize[2] << " ]" << endl;
+    wcout << endl;
+  }
+  warmup_for_GPU();
+  return 0;
+}
+
+/* Exported setup entry point: passes Z and BG to set_compact_BG(), then p_decParams, p_llr and p_out to init_LLR_DMA(). */
+extern "C" void nrLDPC_initcall(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out) {
+  set_compact_BG(p_decParams->Z, p_decParams->BG);
+  init_LLR_DMA(p_decParams, p_llr, p_out);
+}
+
+
 extern "C"
 int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length, time_stats_t *time_decoder)
 {
diff --git a/openair1/PHY/CODING/nrLDPC_load.c b/openair1/PHY/CODING/nrLDPC_load.c
index a74bdf7bd12a73c81ac6654f3e34495194e631a0..677c5d53b0b80f844bb1e677e2dff79fe627f9e2 100644
--- a/openair1/PHY/CODING/nrLDPC_load.c
+++ b/openair1/PHY/CODING/nrLDPC_load.c
@@ -42,14 +42,25 @@
 
 
 /* function description array, to be used when loading the encoding/decoding shared lib */
-static loader_shlibfunc_t shlib_fdesc[2];
 
-char *arg[64]={"ldpctest","-O","cmdlineonly::dbgl0"};
+static loader_shlibfunc_t shlib_fdesc[3];
+
+/* Arguments used when called from PHY simulator executables, which do not use the config module. */
+/* arg is passed to load_configmodule() to initialize the config module so that the loader works as expected. */
+char *arg[64]={"ldpctest","-O","cmdlineonly::dbgl0",NULL,NULL};
 
 int load_nrLDPClib(void) {
 	 char *ptr = (char*)config_get_if();
+	 char libname[64]="ldpc";
+	 int argc=3;
+	 if (run_cuda) {
+         arg[3]="--loader.ldpc.shlibversion";
+         argc++;
+         arg[4]="_cuda";
+         argc++;
+     }
      if ( ptr==NULL )  {// phy simulators, config module possibly not loaded
-     	 load_configmodule(3,(char **)arg,CONFIG_ENABLECMDLINEONLY) ;
+     	 load_configmodule(argc,(char **)arg,CONFIG_ENABLECMDLINEONLY) ;
      	 logInit();
      }	 
      shlib_fdesc[0].fname = "nrLDPC_decod";
diff --git a/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_decoding.c b/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_decoding.c
index 509e42d663be9ee61815b16bb88b02fe5a0c04ca..816b025d34e8b3e220550b70b5b8213675ab19d1 100644
--- a/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_decoding.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_decoding.c
@@ -497,6 +497,7 @@ uint32_t nr_dlsch_decoding(PHY_VARS_NR_UE *phy_vars_ue,
       }
 
       VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_DLSCH_LDPC, VCD_FUNCTION_IN);
+      nrLDPC_initcall(p_decParams, (int8_t*)&pl[0], llrProcBuf);
       no_iteration_ldpc = nrLDPC_decoder(p_decParams,
                                          (int8_t *)&pl[0],
                                          llrProcBuf,
@@ -956,7 +957,7 @@ uint32_t  nr_dlsch_decoding_mthread(PHY_VARS_NR_UE *phy_vars_ue,
     for (i=0, j=0; j < ((kc*harq_process->Z)>>4)+1;  i+=2, j++) {
       pl[j] = _mm_packs_epi16(pv[i],pv[i+1]);
     }
-
+    nrLDPC_initcall(p_decParams, (int8_t*)&pl[0], llrProcBuf);
     no_iteration_ldpc = nrLDPC_decoder(p_decParams,
                                        (int8_t *)&pl[0],
                                        llrProcBuf,
@@ -1340,7 +1341,7 @@ void nr_dlsch_decoding_process(void *arg) {
     for (i=0, j=0; j < ((kc*harq_process->Z)>>4)+1;  i+=2, j++) {
       pl[j] = _mm_packs_epi16(pv[i],pv[i+1]);
     }
-
+    nrLDPC_initcall(p_decParams, (int8_t*)&pl[0], llrProcBuf);
     no_iteration_ldpc = nrLDPC_decoder(p_decParams,
                                        (int8_t *)&pl[0],
                                        llrProcBuf,