fix regressions in cuda ldpc

10d7fe9c · laurent · Raymond Knopp · 697a7871 · 10d7fe9c
Commit 10d7fe9c authored Nov 14, 2023 by laurent Committed by Raymond Knopp Nov 28, 2023
Show whitespace changes
Inline Side-by-side

Showing with 433 additions and 468 deletions

openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu +433 -468

No files found.
--- a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
+++ b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
- /*! \file PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
+/*! \file PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
 * \brief LDPC cuda support BG1 all length
 * \author NCTU OpinConnect Terng-Yin Hsu,WEI-YING,LIN
 * \email tyhsu@cs.nctu.edu.tw
@@ -31,15 +31,15 @@
 #include "bgs/BG2_I5"
 #include "bgs/BG2_I6"
 #include "bgs/BG2_I7"
-
 #define MAX_ITERATION 2
 #define MC	1

+typedef void decode_abort_t;
 #define cudaCheck(ans) { cudaAssert((ans), __FILE__, __LINE__); }
 inline void cudaAssert(cudaError_t code, const char *file, int line)
 {
-   if (code != cudaSuccess){
-      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+  if (code != cudaSuccess) {
+    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    exit(code);
  }
 }
@@ -66,34 +66,16 @@ __device__  h_element dev_h_compact2[68*30];  // used in kernel 2
 // __device__ __constant__ h_element dev_h_compact2[68*30];  // used in kernel 2

 // row and col element count
-__device__ __constant__ char h_ele_row_bg1_count[46] = {
-	19, 19, 19, 19, 3,  8,  9,  7, 10,  9,
-	7,  8,  7,  6,  7,  7,  6,  6,  6,  6,  
-	6,  6,  5,  5,  6,  5,  5,  4,  5,  5,  
-	5,  5,  5,  5,  5,  5,  5,  4,  5,  5,  
-	4,  5,  4,  5,  5,  4};
+__device__ __constant__ char h_ele_row_bg1_count[46] = {19, 19, 19, 19, 3, 8, 9, 7, 10, 9, 7, 8, 7, 6, 7, 7, 6, 6, 6, 6, 6, 6, 5,
+                                                        5,  6,  5,  5,  4, 5, 5, 5, 5,  5, 5, 5, 5, 5, 4, 5, 5, 4, 5, 4, 5, 5, 4};
 __device__ __constant__ char h_ele_col_bg1_count[68] = {
-	30, 28,  7, 11,  9,  4,  8, 12,  8,  7, 
-	12, 10, 12, 11, 10,  7, 10, 10, 13,  7,  
-	8,  11, 12,  5,  6,  6,  1,  1,  1,  1,  
-	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  
-	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  
-	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  
-	1,   1,  1,  1,  1,  1,  1,  1};
-__device__ __constant__ char h_ele_row_bg2_count[42] = { 
-	8, 10,  8, 10,  4,  6,  6,  6,  4,  5, 
-	5,  5,  4,  5,  5,  4,  5,  5,  4,  4,
-	4,  4,  3,  4,  4,  3,  5,  3,  4,  3,
-	5,  3,  4,  4,  4,  4,  4,  3,  4,  4, 
-	4,  4};
-__device__ __constant__ char h_ele_col_bg2_count[52] = {
-	22, 23, 10,  5,  5, 14,  7, 13,  6,  8, 
-	9,  16,  9, 12,  1,  1,  1,  1,  1,  1,
-	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,
-	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,
-	1,   1,  1,  1,  1,  1,  1,  1,  1,  1, 
-	1,   1};
-
+    30, 28, 7, 11, 9, 4, 8, 12, 8, 7, 12, 10, 12, 11, 10, 7, 10, 10, 13, 7, 8, 11, 12, 5, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1,
+    1,  1,  1, 1,  1, 1, 1, 1,  1, 1, 1,  1,  1,  1,  1,  1, 1,  1,  1,  1, 1, 1,  1,  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+__device__ __constant__ char h_ele_row_bg2_count[42] = {8, 10, 8, 10, 4, 6, 6, 6, 4, 5, 5, 5, 4, 5, 5, 4, 5, 5, 4, 4, 4,
+                                                        4, 3,  4, 4,  3, 5, 3, 4, 3, 5, 3, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4};
+__device__ __constant__ char h_ele_col_bg2_count[52] = {22, 23, 10, 5, 5, 14, 7, 13, 6, 8, 9, 16, 9, 12, 1, 1, 1, 1,
+                                                        1,  1,  1,  1, 1, 1,  1, 1,  1, 1, 1, 1,  1, 1,  1, 1, 1, 1,
+                                                        1,  1,  1,  1, 1, 1,  1, 1,  1, 1, 1, 1,  1, 1,  1, 1};

 __global__ void warmup()
 {
@@ -102,82 +84,78 @@ __global__ void warmup()

 extern "C"
 void warmup_for_GPU(){
-	
-	warmup<<<20,1024 >>>();
-
+  warmup<<<20, 1024>>>();
 }

 extern "C"
 void set_compact_BG(int Zc,short BG){
-	
-	int row,col;
-	if(BG == 1){
+  int row, col;
+  if (BG == 1) {
    row = 46;
    col = 68;
-	}
-	else{
+  } else {
    row = 42;
    col = 52;
  }
  int compact_row = 30;
  int compact_col = 19;
-	if(BG==2){compact_row = 10, compact_col = 23;}
+  if (BG == 2) {
+    compact_row = 10, compact_col = 23;
+  }
  int memorySize_h_compact1 = row * compact_col * sizeof(h_element);
  int memorySize_h_compact2 = compact_row * col * sizeof(h_element);
  int lift_index = 0;
-	short lift_set[][9] = {
-		{2,4,8,16,32,64,128,256},
-		{3,6,12,24,48,96,192,384},
-		{5,10,20,40,80,160,320},
-		{7,14,28,56,112,224},
-		{9,18,36,72,144,288},
-		{11,22,44,88,176,352},
-		{13,26,52,104,208},
-		{15,30,60,120,240},
-		{0}
-	};
-	
-	for(int i = 0; lift_set[i][0] != 0; i++){
-		for(int j = 0; lift_set[i][j] != 0; j++){
-			if(Zc == lift_set[i][j]){
+  short lift_set[][9] = {{2, 4, 8, 16, 32, 64, 128, 256},
+                         {3, 6, 12, 24, 48, 96, 192, 384},
+                         {5, 10, 20, 40, 80, 160, 320},
+                         {7, 14, 28, 56, 112, 224},
+                         {9, 18, 36, 72, 144, 288},
+                         {11, 22, 44, 88, 176, 352},
+                         {13, 26, 52, 104, 208},
+                         {15, 30, 60, 120, 240},
+                         {0}};
+
+  for (int i = 0; lift_set[i][0] != 0; i++) {
+    for (int j = 0; lift_set[i][j] != 0; j++) {
+      if (Zc == lift_set[i][j]) {
        lift_index = i;
        break;
      }
    }
  }
-	printf("\nZc = %d BG = %d\n",Zc,BG);
-	switch(lift_index){
+  printf("\nZc = %d BG = %d\n", Zc, BG);
+  switch (lift_index) {
    case 0:
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I0, memorySize_h_compact1) );
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I0, memorySize_h_compact2) );
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I0, memorySize_h_compact1));
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I0, memorySize_h_compact2));
      break;
    case 1:
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I1, memorySize_h_compact1) );
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I1, memorySize_h_compact2) );
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I1, memorySize_h_compact1));
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I1, memorySize_h_compact2));
      break;
    case 2:
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I2, memorySize_h_compact1) );
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I2, memorySize_h_compact2) );
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I2, memorySize_h_compact1));
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I2, memorySize_h_compact2));
      break;
    case 3:
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I3, memorySize_h_compact1) );
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I3, memorySize_h_compact2) );
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I3, memorySize_h_compact1));
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I3, memorySize_h_compact2));
      break;
    case 4:
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I4, memorySize_h_compact1) );
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I4, memorySize_h_compact2) );
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I4, memorySize_h_compact1));
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I4, memorySize_h_compact2));
      break;
    case 5:
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I5, memorySize_h_compact1) );
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I5, memorySize_h_compact2) );
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I5, memorySize_h_compact1));
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I5, memorySize_h_compact2));
      break;
    case 6:
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I6, memorySize_h_compact1) );
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I6, memorySize_h_compact2) );
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I6, memorySize_h_compact1));
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I6, memorySize_h_compact2));
      break;
    case 7:
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I7, memorySize_h_compact1) );
-				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I7, memorySize_h_compact2) );
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I7, memorySize_h_compact1));
+      cudaCheck(cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I7, memorySize_h_compact2));
      break;
  }

@@ -188,7 +166,7 @@ void set_compact_BG(int Zc,short BG){
 // Kernel 1
 __global__ void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, int col, int Zc)
 {
-//	if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp %d\n", threadIdx.x);
+  //	if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp %d\n", threadIdx.x);
  int iMCW = blockIdx.y; // codeword id
  int iBlkRow = blockIdx.x; // block row in h_base
  int iBlkCol; // block col in h_base
@@ -197,7 +175,7 @@ __global__ void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, in
  int offsetR;
  int shift_t;

-//	For 2-min algorithm.
+  //	For 2-min algorithm.
  int Q_sign = 0;
  int sq;
  int Q, Q_abs;
@@ -209,55 +187,55 @@ __global__ void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, in
  char idx_min = 0;

  h_element h_element_t;
-	int s = (BG==1)? h_ele_row_bg1_count[iBlkRow]:h_ele_row_bg2_count[iBlkRow];
-	offsetR = (iMCW * row*col*Zc) + iBlkRow * Zc + iSubRow;	// row*col*Zc = size of dev_dt
-//	if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
-//	The 1st recursion
-	for(int i = 0; i < s; i++) // loop through all the ZxZ sub-blocks in a row
+  int s = (BG == 1) ? h_ele_row_bg1_count[iBlkRow] : h_ele_row_bg2_count[iBlkRow];
+  offsetR = (iMCW * row * col * Zc) + iBlkRow * Zc + iSubRow; // row*col*Zc = size of dev_dt
+  //	if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
+  //	The 1st recursion
+  for (int i = 0; i < s; i++) // loop through all the ZxZ sub-blocks in a row
  {
-		h_element_t = dev_h_compact1[i*row+iBlkRow];	// compact_col == row
+    h_element_t = dev_h_compact1[i * row + iBlkRow]; // compact_col == row

    iBlkCol = h_element_t.y;
    shift_t = h_element_t.value;

    shift_t = (iSubRow + shift_t) % Zc;
-		iCol = (iMCW * col*Zc) + iBlkCol * Zc + shift_t;	// col*Zc = size of llr
+    iCol = (iMCW * col * Zc) + iBlkCol * Zc + shift_t; // col*Zc = size of llr
    Q = dev_llr[iCol];
-		Q_abs = (Q>0)? Q : -Q;
+    Q_abs = (Q > 0) ? Q : -Q;
    sq = Q < 0;
-//		if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
+    //		if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
    // quick version
    sign = sign * (1 - sq * 2);
    Q_sign |= sq << i;

-		if (Q_abs < rmin1){
+    if (Q_abs < rmin1) {
      rmin2 = rmin1;
      rmin1 = Q_abs;
      idx_min = i;
-		} else if (Q_abs < rmin2){
+    } else if (Q_abs < rmin2) {
      rmin2 = Q_abs;
    }
  }

-//	if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);
+  //	if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);

-//	The 2nd recursion
-	for(int i = 0; i < s; i++){
+  //	The 2nd recursion
+  for (int i = 0; i < s; i++) {
    // v0: Best performance so far. 0.75f is the value of alpha.
    sq = 1 - 2 * ((Q_sign >> i) & 0x01);
    R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2);
    // write results to global memory
-		h_element_t = dev_h_compact1[i*row+iBlkRow];
+    h_element_t = dev_h_compact1[i * row + iBlkRow];
    int addr_temp = offsetR + h_element_t.y * row * Zc;
    dev_dt[addr_temp] = R_temp;
-//		if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
+    //		if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
  }
 }

 // Kernel_1
 __global__ void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int row, int col, int Zc)
 {
-//	if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp\n");
+  //	if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp\n");
  int iMCW = blockIdx.y;
  int iBlkRow = blockIdx.x; // block row in h_base
  int iBlkCol; // block col in h_base
@@ -266,7 +244,7 @@ __global__ void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int r
  int offsetR;
  int shift_t;

-//	For 2-min algorithm.
+  //	For 2-min algorithm.
  int Q_sign = 0;
  int sq;
  int Q, Q_abs;
@@ -278,13 +256,13 @@ __global__ void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int r
  char idx_min = 0;

  h_element h_element_t;
-	int s = (BG==1)? h_ele_row_bg1_count[iBlkRow]: h_ele_row_bg2_count[iBlkRow];
-	offsetR = (iMCW *row*col*Zc) + iBlkRow * Zc + iSubRow;	// row * col * Zc = size of dev_dt
-//	if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
-//	The 1st recursion
-	for(int i = 0; i < s; i++) // loop through all the ZxZ sub-blocks in a row
+  int s = (BG == 1) ? h_ele_row_bg1_count[iBlkRow] : h_ele_row_bg2_count[iBlkRow];
+  offsetR = (iMCW * row * col * Zc) + iBlkRow * Zc + iSubRow; // row * col * Zc = size of dev_dt
+  //	if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
+  //	The 1st recursion
+  for (int i = 0; i < s; i++) // loop through all the ZxZ sub-blocks in a row
  {
-		h_element_t = dev_h_compact1[i*row+iBlkRow];
+    h_element_t = dev_h_compact1[i * row + iBlkRow];

    iBlkCol = h_element_t.y;
    shift_t = h_element_t.value;
@@ -293,36 +271,34 @@ __global__ void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int r

    R_temp = dev_dt[offsetR + iBlkCol * row * Zc];

-		Q = dev_llr[iMCW * (col*Zc) + iCol] - R_temp;
-		Q_abs = (Q>0)? Q : -Q;
-//		if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
+    Q = dev_llr[iMCW * (col * Zc) + iCol] - R_temp;
+    Q_abs = (Q > 0) ? Q : -Q;
+    //		if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
    sq = Q < 0;
    sign = sign * (1 - sq * 2);
    Q_sign |= sq << i;

-		if (Q_abs < rmin1){
+    if (Q_abs < rmin1) {
      rmin2 = rmin1;
      rmin1 = Q_abs;
      idx_min = i;
-		} else if (Q_abs < rmin2){
+    } else if (Q_abs < rmin2) {
      rmin2 = Q_abs;
    }
-		
  }

-//	if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);
+  //	if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);

-//	The 2nd recursion
-	for(int i = 0; i < s; i ++){
+  //	The 2nd recursion
+  for (int i = 0; i < s; i++) {
    sq = 1 - 2 * ((Q_sign >> i) & 0x01);
    R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2);

-
    // write results to global memory
-		h_element_t = dev_h_compact1[i*row+iBlkRow];
+    h_element_t = dev_h_compact1[i * row + iBlkRow];
    int addr_temp = h_element_t.y * row * Zc + offsetR;
    dev_dt[addr_temp] = R_temp;
-//		if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
+    //		if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
  }
 }

@@ -344,15 +320,14 @@ ldpc_vnp_kernel_normal(/*char * dev_llr, char * dev_dt, char * dev_const_llr,*/

  // update all the llr values
  iCol = iBlkCol * Zc + iSubCol;
-	APP = dev_const_llr[iMCW *col*Zc + iCol];
-	int offsetDt = iMCW *row*col*Zc + iBlkCol * row * Zc;
-	int s = (BG==1)? h_ele_col_bg1_count[iBlkCol]:h_ele_col_bg2_count[iBlkCol];
+  APP = dev_const_llr[iMCW * col * Zc + iCol];
+  int offsetDt = iMCW * row * col * Zc + iBlkCol * row * Zc;
+  int s = (BG == 1) ? h_ele_col_bg1_count[iBlkCol] : h_ele_col_bg2_count[iBlkCol];

-	for(int i = 0; i < s; i++)
-	{
-		h_element_t = dev_h_compact2[i*col+iBlkCol];
+  for (int i = 0; i < s; i++) {
+    h_element_t = dev_h_compact2[i * col + iBlkCol];

-		shift_t = h_element_t.value%Zc;
+    shift_t = h_element_t.value % Zc;
    iBlkRow = h_element_t.x;

    sf = iSubCol - shift_t;
@@ -361,10 +336,12 @@ ldpc_vnp_kernel_normal(/*char * dev_llr, char * dev_dt, char * dev_const_llr,*/
    iRow = iBlkRow * Zc + sf;
    APP = APP + dev_dt[offsetDt + iRow];
  }
-	if(APP > SCHAR_MAX)	APP = SCHAR_MAX;
-	if(APP < SCHAR_MIN)	APP = SCHAR_MIN;
+  if (APP > SCHAR_MAX)
+    APP = SCHAR_MAX;
+  if (APP < SCHAR_MIN)
+    APP = SCHAR_MIN;
  // write back to device global memory
-	dev_llr[iMCW *col*Zc + iCol] = APP;
+  dev_llr[iMCW * col * Zc + iCol] = APP;
 }


@@ -372,19 +349,19 @@ __global__ void pack_decoded_bit(/*char *dev, unsigned char *host,*/ int col, in
 {
  __shared__ unsigned char tmp[128];
  int iMCW = blockIdx.y;
-	int tid = iMCW * col*Zc + blockIdx.x*128 + threadIdx.x;
+  int tid = iMCW * col * Zc + blockIdx.x * 128 + threadIdx.x;
  int btid = threadIdx.x;
  tmp[btid] = 0;

-	if(dev_llr[tid] < 0){
-		tmp[btid] = 1 << (7-(btid&7));
+  if (dev_llr[tid] < 0) {
+    tmp[btid] = 1 << (7 - (btid & 7));
  }
  __syncthreads();

-	if(threadIdx.x < 16){
-		dev_tmp[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] = 0;
-		for(int i = 0; i < 8; i++){
-			dev_tmp[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] += tmp[threadIdx.x*8+i];
+  if (threadIdx.x < 16) {
+    dev_tmp[iMCW * col * Zc + blockIdx.x * 16 + threadIdx.x] = 0;
+    for (int i = 0; i < 8; i++) {
+      dev_tmp[iMCW * col * Zc + blockIdx.x * 16 + threadIdx.x] += tmp[threadIdx.x * 8 + i];
    }
  }
 }
@@ -392,28 +369,30 @@ __global__ void pack_decoded_bit(/*char *dev, unsigned char *host,*/ int col, in
 void read_BG(int BG, int *h, int row, int col)
 {
  int compact_row = 30, compact_col = 19;
-	if(BG==2){compact_row = 10, compact_col = 23;}
+  if (BG == 2) {
+    compact_row = 10, compact_col = 23;
+  }

  h_element h_element_temp;

  // init the compact matrix
-	for(int i = 0; i < compact_col; i++){
-		for(int j = 0; j < row; j++){
+  for (int i = 0; i < compact_col; i++) {
+    for (int j = 0; j < row; j++) {
      h_element_temp.x = 0;
      h_element_temp.y = 0;
      h_element_temp.value = -1;
-			h_compact1[i*row+j] = h_element_temp; // h[i][0-11], the same column
+      h_compact1[i * row + j] = h_element_temp; // h[i][0-11], the same column
    }
  }
  // scan the h matrix, and gengerate compact mode of h
-	for(int i = 0; i < row; i++){
+  for (int i = 0; i < row; i++) {
    int k = 0;
-		for(int j = 0; j < col; j++){
-			if(h[i*col+j] != -1){
+    for (int j = 0; j < col; j++) {
+      if (h[i * col + j] != -1) {
        h_element_temp.x = i;
        h_element_temp.y = j;
-				h_element_temp.value = h[i*col+j];
-				h_compact1[k*row+i] = h_element_temp;
+        h_element_temp.value = h[i * col + j];
+        h_compact1[k * row + i] = h_element_temp;
        k++;
      }
    }
@@ -421,24 +400,24 @@ void read_BG(int BG, int *h, int row, int col)

  // h_compact2
  // init the compact matrix
-	for(int i = 0; i < compact_row; i++){
-		for(int j = 0; j < col; j++){
+  for (int i = 0; i < compact_row; i++) {
+    for (int j = 0; j < col; j++) {
      h_element_temp.x = 0;
      h_element_temp.y = 0;
      h_element_temp.value = -1;
-			h_compact2[i*col+j] = h_element_temp;
+      h_compact2[i * col + j] = h_element_temp;
    }
  }

-	for(int j = 0; j < col; j++){
-		int k=0;
-		for(int i = 0; i < row; i++){
-			if(h[i*col+j] != -1){
+  for (int j = 0; j < col; j++) {
+    int k = 0;
+    for (int i = 0; i < row; i++) {
+      if (h[i * col + j] != -1) {
        // although h is transposed, the (x,y) is still (iBlkRow, iBlkCol)
        h_element_temp.x = i;
        h_element_temp.y = j;
-				h_element_temp.value = h[i*col+j];
-				h_compact2[k*col+j] = h_element_temp;
+        h_element_temp.value = h[i * col + j];
+        h_compact2[k * col + j] = h_element_temp;
        k++;
      }
    }
@@ -463,25 +442,18 @@ void read_BG(int BG, int *h, int row, int col)

 extern "C"
 void init_LLR_DMA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out){
-	
  uint16_t Zc = p_decParams->Z;
  uint8_t BG = p_decParams->BG;
-    int block_length     = p_decParams->block_length;
-	uint8_t row,col;
-	if(BG == 1){
-		row = 46;
+  uint8_t col;
+  if (BG == 1) {
    col = 68;
-	}
-	else{
-		row = 42;
+  } else {
    col = 52;
  }
-	unsigned char *hard_decision = (unsigned char*)p_out;
  int memorySize_llr_cuda = col * Zc * sizeof(char) * MC;
-	cudaCheck( cudaMemcpyToSymbol(dev_const_llr, p_llr, memorySize_llr_cuda) );
-	cudaCheck( cudaMemcpyToSymbol(dev_llr, p_llr, memorySize_llr_cuda) );
+  cudaCheck(cudaMemcpyToSymbol(dev_const_llr, p_llr, memorySize_llr_cuda));
+  cudaCheck(cudaMemcpyToSymbol(dev_llr, p_llr, memorySize_llr_cuda));
  cudaDeviceSynchronize();
-	
 }

 using namespace std ;
@@ -489,7 +461,7 @@ using namespace std ;
 /* from here: entry points in decoder shared lib */
 extern "C"
 int ldpc_autoinit(void) {   // called by the library loader
-/*int devices = 0; 
+  /*int devices = 0;

    cudaError_t err = cudaGetDeviceCount(&devices);
    AssertFatal(devices>0,"\nNo cuda GPU found\n\n");
@@ -515,55 +487,55 @@ int ldpc_autoinit(void) {   // called by the library loader

    wcout << "  Warp size:         " << props.warpSize << endl;
    wcout << "  Threads per block: " << props.maxThreadsPerBlock << endl;
-        wcout << "  Max block dimensions: [ " << props.maxThreadsDim[0] << ", " << props.maxThreadsDim[1]  << ", " << props.maxThreadsDim[2] << " ]" << endl;
-        wcout << "  Max grid dimensions:  [ " << props.maxGridSize[0] << ", " << props.maxGridSize[1]  << ", " << props.maxGridSize[2] << " ]" << endl;
-        wcout << endl;
+    wcout << "  Max block dimensions: [ " << props.maxThreadsDim[0] << ", " << props.maxThreadsDim[1]  << ", " <<
+    props.maxThreadsDim[2] << " ]" << endl; wcout << "  Max grid dimensions:  [ " << props.maxGridSize[0] << ", " <<
+    props.maxGridSize[1]  << ", " << props.maxGridSize[2] << " ]" << endl; wcout << endl;
    }
-*/
+  */
  warmup_for_GPU();
  return 0;  
 }

-extern "C" void LDPCinit(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out)
+extern "C" int32_t LDPCinit()
 {
-  set_compact_BG(p_decParams->Z, p_decParams->BG);
-  init_LLR_DMA(p_decParams, p_llr, p_out);
+  return 0;
 }

 extern "C" void LDPCshutdown()
 {
 }

-extern "C" int32_t LDPCdecoder(t_nrLDPC_dec_params* p_decParams,
-                               int8_t* p_llr,
-                               int8_t* p_out,
-                               t_nrLDPC_procBuf* p_procBuf,
-                               t_nrLDPC_time_stats* time_decoder)
+extern "C"  int32_t LDPCdecoder(t_nrLDPC_dec_params *p_decParams,
+                                    uint8_t harq_pid,
+                                    uint8_t ulsch_id,
+                                    uint8_t C,
+                                    int8_t *p_llr,
+                                    int8_t *p_out,
+                                    t_nrLDPC_time_stats *,
+                                    decode_abort_t *ab)
 {
+  set_compact_BG(p_decParams->Z, p_decParams->BG);
+  init_LLR_DMA(p_decParams, p_llr, p_out);
  uint16_t Zc = p_decParams->Z;
  uint8_t BG = p_decParams->BG;
-    uint8_t  numMaxIter = p_decParams->numMaxIter;
  int block_length = p_decParams->block_length;
-    e_nrLDPC_outMode outMode = p_decParams->outMode;
-	cudaError_t cudaStatus;
-	uint8_t row,col;
-	if(BG == 1){
+  uint8_t row, col;
+  if (BG == 1) {
    row = 46;
    col = 68;
-	}
-	else{
+  } else {
    row = 42;
    col = 52;
  }

-//	alloc memory
-	unsigned char *hard_decision = (unsigned char*)p_out;
-//	gpu
+  //	alloc memory
+  unsigned char* hard_decision = (unsigned char*)p_out;
+  //	gpu
  int memorySize_llr_cuda = col * Zc * sizeof(char) * MC;
-	cudaCheck( cudaMemcpyToSymbol(dev_const_llr, p_llr, memorySize_llr_cuda) );
-	cudaCheck( cudaMemcpyToSymbol(dev_llr, p_llr, memorySize_llr_cuda) );
+  cudaCheck(cudaMemcpyToSymbol(dev_const_llr, p_llr, memorySize_llr_cuda));
+  cudaCheck(cudaMemcpyToSymbol(dev_llr, p_llr, memorySize_llr_cuda));

-// Define CUDA kernel dimension
+  // Define CUDA kernel dimension
  int blockSizeX = Zc;
  dim3 dimGridKernel1(row, MC, 1); // dim of the thread blocks
  dim3 dimBlockKernel1(blockSizeX, 1, 1);
@@ -572,34 +544,27 @@ extern "C" int32_t LDPCdecoder(t_nrLDPC_dec_params* p_decParams,
  dim3 dimBlockKernel2(blockSizeX, 1, 1);
  cudaDeviceSynchronize();

-// lauch kernel 
+  // lauch kernel

-	for(int ii = 0; ii < MAX_ITERATION; ii++){
+  for (int ii = 0; ii < MAX_ITERATION; ii++) {
    // first kernel
-		if(ii == 0){
-			ldpc_cnp_kernel_1st_iter 
-			<<<dimGridKernel1, dimBlockKernel1>>>
-			(/*dev_llr,*/ BG, row, col, Zc);
-		}else{
-			ldpc_cnp_kernel
-			<<<dimGridKernel1, dimBlockKernel1>>>
-			(/*dev_llr,*/ BG, row, col, Zc);
+    if (ii == 0) {
+      ldpc_cnp_kernel_1st_iter<<<dimGridKernel1, dimBlockKernel1>>>(/*dev_llr,*/ BG, row, col, Zc);
+    } else {
+      ldpc_cnp_kernel<<<dimGridKernel1, dimBlockKernel1>>>(/*dev_llr,*/ BG, row, col, Zc);
    }
    // second kernel
-		ldpc_vnp_kernel_normal
-		<<<dimGridKernel2, dimBlockKernel2>>>
+    ldpc_vnp_kernel_normal<<<dimGridKernel2, dimBlockKernel2>>>
        // (dev_llr, dev_const_llr,BG, row, col, Zc);
        (BG, row, col, Zc);
  }

-	int pack = (block_length/128)+1;
+  int pack = (block_length / 128) + 1;
  dim3 pack_block(pack, MC, 1);
-	pack_decoded_bit<<<pack_block,128>>>(/*dev_llr,*/ /*dev_tmp,*/ col, Zc);
+  pack_decoded_bit<<<pack_block, 128>>>(/*dev_llr,*/ /*dev_tmp,*/ col, Zc);

-	cudaCheck( cudaMemcpyFromSymbol((void*)hard_decision, (const void*)dev_tmp, (block_length/8)*sizeof(unsigned char)) );
+  cudaCheck(cudaMemcpyFromSymbol((void*)hard_decision, (const void*)dev_tmp, (block_length / 8) * sizeof(unsigned char)));
  cudaDeviceSynchronize();

-
  return MAX_ITERATION;
-	
 }