opencl implementation of ldpc decoder (platform query)

4345d42e · frtabu · d31ac580 · 4345d42e · 4345d42e
Commit 4345d42e authored Oct 05, 2021 by frtabu
2 changed files
--- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_CL.c
+++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_CL.c
@@ -40,7 +40,7 @@
 #define MC	1

 #define MAX_OCLDEV   10
-
+#define MAX_OCLRUNTIME 5
 typedef struct{
  char x;
  char y;
@@ -51,14 +51,20 @@ typedef struct{
  cl_uint max_CU;
  cl_uint max_WID;
  size_t  *max_WIS;
+} ocldev_t;
+
+typedef struct{
+  cl_uint num_devices;
+  cl_device_id devices[MAX_OCLDEV];
+  ocldev_t ocldev[MAX_OCLDEV];
  cl_context context;
  cl_program program;
  cl_kernel kernel;
-  cl_command_queue queue;
-} ocldev_t;
+  cl_command_queue queue[MAX_OCLDEV];
+} oclruntime_t;

 typedef struct{
-  ocldev_t ocldev[MAX_OCLDEV];
+  oclruntime_t runtime[MAX_OCLRUNTIME];
 } ocl_t;


@@ -86,7 +92,8 @@ void init_LLR_DMA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out
 //	cudaDeviceSynchronize();
 	
 }
-cl_error_callback(const char* errinfo, const void* private_info, size_t cb, void* user_data) {
+void cl_error_callback(const char* errinfo, const void* private_info, size_t cb, void* user_data) {
+  oclruntime_t *runtime = (oclruntime_t *)user_data;
  LOG_E(HW,"OpenCL accelerator error  %s\n", errinfo );
 }

@@ -103,11 +110,25 @@ char *clutil_getstrdev(int intdev) {
  return retstring;
 }

+size_t load_source(char **source_str) {
+	int MAX_SOURCE_SIZE=(500*132);
+    FILE *fp;
+    size_t source_size;
+ 
+    fp = fopen("nrLDPC_decoder_kernels_CL.cl", "r");
+    AssertFatal(fp,"failed to open cl source: %s\n",strerror(errno));
+    
+    *source_str = (char*)malloc(MAX_SOURCE_SIZE);
+    source_size = fread( *source_str, 1, MAX_SOURCE_SIZE, fp);
+    fclose( fp );
+    return source_size;
+}
+
 /* from here: entry points in decoder shared lib */
 int ldpc_autoinit(void) {   // called by the library loader 
  cl_platform_id platforms[10];
  cl_uint         num_platforms_found;
-  
+  int             context_ok=0;
  cl_uint rt = clGetPlatformIDs( sizeof(platforms)/sizeof(cl_platform_id), platforms, &num_platforms_found );
  AssertFatal(rt == CL_SUCCESS, "clGetPlatformIDs error %d\n" , (int)rt);
  AssertFatal( num_platforms_found>0 , "clGetPlatformIDs: no cl compatible platform found\n");
@@ -119,33 +140,50 @@ int ldpc_autoinit(void) {   // called by the library loader
 	  rt = clGetPlatformInfo(platforms[i],CL_PLATFORM_VERSION, sizeof(stringval),stringval,NULL);	
 	  AssertFatal(rt == CL_SUCCESS, "clGetPlatformInfo VERSION error %d\n" , (int)rt);  
 	  LOG_I(HW,"Platform %i, OpenCL version %s\n", i,stringval );	  
-	  cl_device_id devices[20];
-	  cl_uint       num_devices_found;
-	  rt = clGetDeviceIDs(platforms[i],CL_DEVICE_TYPE_ALL, sizeof(devices)/sizeof(cl_device_id),devices,&num_devices_found);
+	  rt = clGetDeviceIDs(platforms[i],CL_DEVICE_TYPE_ALL, sizeof(ocl.runtime[i].devices)/sizeof(cl_device_id),ocl.runtime[i].devices,&(ocl.runtime[i].num_devices));
 	  AssertFatal(rt == CL_SUCCESS, "clGetDeviceIDs error %d\n" , (int)rt);
-	  for (int j=0; j<num_devices_found; j++) {
+	  int devok=0;
+	  for (int j=0; j<ocl.runtime[i].num_devices; j++) {
 		cl_bool abool;
-		rt = clGetDeviceInfo(devices[j],CL_DEVICE_AVAILABLE, sizeof(abool),&abool,NULL);
+		rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_AVAILABLE, sizeof(abool),&abool,NULL);
 		AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo DEVICE_AVAILABLE error %d\n" , (int)rt); 
 		LOG_I(HW,"Device %i is %s available\n", j, (abool==CL_TRUE?"":"not"));
 		cl_device_type devtype;		
-		rt = clGetDeviceInfo(devices[j],CL_DEVICE_TYPE, sizeof(cl_device_type),&devtype,NULL);
+		rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_TYPE, sizeof(cl_device_type),&devtype,NULL);
 		AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo DEVICE_TYPE error %d\n" , (int)rt); 
 		LOG_I(HW,"Device %i, type %d = %s\n", j,(int)devtype, clutil_getstrdev(devtype));
-		rt = clGetDeviceInfo(devices[j],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(ocl.ocldev[j].max_CU),&(ocl.ocldev[j].max_CU),NULL);
+		rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(ocl.runtime[i].ocldev[j].max_CU),&(ocl.runtime[i].ocldev[j].max_CU),NULL);
 		AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo MAX_COMPUTE_UNITS error %d\n" , (int)rt);
-		LOG_I(HW,"Device %i, number of Compute Units: %d\n", j,ocl.ocldev[j].max_CU);
-		rt = clGetDeviceInfo(devices[j],CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,sizeof(ocl.ocldev[j].max_WID),&(ocl.ocldev[j].max_WID),NULL);
+		LOG_I(HW,"Device %i, number of Compute Units: %d\n", j,ocl.runtime[i].ocldev[j].max_CU);
+		rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,sizeof(ocl.runtime[i].ocldev[j].max_WID),&(ocl.runtime[i].ocldev[j].max_WID),NULL);
 		AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo MAX_WORK_ITEM_DIMENSIONS error %d\n" , (int)rt);
-		LOG_I(HW,"Device %i, max Work Items dimension: %d\n", j,ocl.ocldev[j].max_WID);	
-		ocl.ocldev[j].max_WIS = (size_t *)malloc(ocl.ocldev[j].max_WID * sizeof(size_t));
-		rt = clGetDeviceInfo(devices[j],CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(ocl.ocldev[j].max_WID)*sizeof(size_t),ocl.ocldev[j].max_WIS,NULL);
+		LOG_I(HW,"Device %i, max Work Items dimension: %d\n", j,ocl.runtime[i].ocldev[j].max_WID);	
+		ocl.runtime[i].ocldev[j].max_WIS = (size_t *)malloc(ocl.runtime[i].ocldev[j].max_WID * sizeof(size_t));
+		rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(ocl.runtime[i].ocldev[j].max_WID)*sizeof(size_t),ocl.runtime[i].ocldev[j].max_WIS,NULL);
 		AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo MAX_WORK_ITEM_SIZES error %d\n" , (int)rt);
-		for(int k=0; k<ocl.ocldev[j].max_WID;k++)
-		  LOG_I(HW,"Device %i, max Work Items size for dimension: %d %u\n", j,k,(uint32_t)ocl.ocldev[j].max_WIS[k]);
-		
+		for(int k=0; k<ocl.runtime[i].ocldev[j].max_WID;k++)
+		  LOG_I(HW,"Device %i, max Work Items size for dimension: %d %u\n", j,k,(uint32_t)ocl.runtime[i].ocldev[j].max_WIS[k]); 
+		devok++;
+      }
+      if (devok >0) {
+        ocl.runtime[i].context = clCreateContext(NULL, ocl.runtime[i].num_devices, ocl.runtime[i].devices, cl_error_callback, &(ocl.runtime[i]), (cl_int *)&rt); 
+        AssertFatal(rt == CL_SUCCESS, "Error %d creating context for platform %i\n" , (int)rt, i);
+        for(int dev=0; dev<ocl.runtime[i].num_devices; dev++) {
+          ocl.runtime[i].queue[dev] = clCreateCommandQueueWithProperties(ocl.runtime[i].context,ocl.runtime[i].devices[dev] , 0, (cl_int *)&rt);
+          AssertFatal(rt == CL_SUCCESS, "Error %d creating command queue for platform %i device %i\n" , (int)rt, i,dev);
+        }
+        char *source_str;
+        size_t source_size=load_source(&source_str);
+        cl_program program = clCreateProgramWithSource(ocl.runtime[i].context, 1, 
+                                                       (const char **)&source_str, (const size_t *)&source_size,  (cl_int *)&rt);
+        AssertFatal(rt == CL_SUCCESS, "Error %d creating program for platform %i \n" , (int)rt, i); 
+        rt = clBuildProgram(program, ocl.runtime[i].num_devices,ocl.runtime[i].devices, NULL, NULL, NULL);   
+        AssertFatal(rt == CL_SUCCESS, "Error %d buildding program for platform %i \n" , rt, i);                                            
+        context_ok++;
      }
+      devok=0;
  }
+  AssertFatal(context_ok>0, "No openCL device available to accelerate ldpc\n"); 
  return 0;  
 }


--- a/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_kernels_CL.cl
+++ b/openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_decoder_kernels_CL.cl
+
+define MAX_ITERATION 2
+#define MC	1
+
+
+
+typedef struct{
+  char x;
+  char y;
+  short value;
+} h_element;
+
+char dev_const_llr[68*384];
+char dev_dt [46*68*384];
+char dev_llr[68*384];
+unsigned char dev_tmp[68*384];
+
+h_element h_compact1 [46*19] = {};
+h_element h_compact2 [68*30] = {};
+
+h_element dev_h_compact1[46*19];  // used in kernel 1
+h_element dev_h_compact2[68*30];  // used in kernel 2
+
+// __device__ __constant__ h_element dev_h_compact1[46*19];  // used in kernel 1
+// __device__ __constant__ h_element dev_h_compact2[68*30];  // used in kernel 2
+
+// row and col element count
+const char h_ele_row_bg1_count[46] = {
+	19, 19, 19, 19, 3,  8,  9,  7, 10,  9,
+	7,  8,  7,  6,  7,  7,  6,  6,  6,  6,  
+	6,  6,  5,  5,  6,  5,  5,  4,  5,  5,  
+	5,  5,  5,  5,  5,  5,  5,  4,  5,  5,  
+	4,  5,  4,  5,  5,  4};
+const char h_ele_col_bg1_count[68] = {
+	30, 28,  7, 11,  9,  4,  8, 12,  8,  7, 
+	12, 10, 12, 11, 10,  7, 10, 10, 13,  7,  
+	8,  11, 12,  5,  6,  6,  1,  1,  1,  1,  
+	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  
+	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  
+	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  
+	1,   1,  1,  1,  1,  1,  1,  1};
+const char h_ele_row_bg2_count[42] = { 
+	8, 10,  8, 10,  4,  6,  6,  6,  4,  5, 
+	5,  5,  4,  5,  5,  4,  5,  5,  4,  4,
+	4,  4,  3,  4,  4,  3,  5,  3,  4,  3,
+	5,  3,  4,  4,  4,  4,  4,  3,  4,  4, 
+	4,  4};
+const char h_ele_col_bg2_count[52] = {
+	22, 23, 10,  5,  5, 14,  7, 13,  6,  8, 
+	9,  16,  9, 12,  1,  1,  1,  1,  1,  1,
+	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,   1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,   1,  1,  1,  1,  1,  1,  1,  1,  1, 
+	1,   1};
+
+
+
+// Kernel 1
+__kernel void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, int col, int Zc)
+{
+//	if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp %d\n", threadIdx.x);
+	int iMCW = blockIdx.y;		// codeword id
+	int iBlkRow = blockIdx.x;	// block row in h_base
+	int iBlkCol;				// block col in h_base
+	int iSubRow = threadIdx.x;	// row index in sub_block of h_base
+	int iCol;					// overall col index in h_base
+	int offsetR;
+	int shift_t;
+
+//	For 2-min algorithm.
+	int Q_sign = 0;
+	int sq;
+	int Q, Q_abs;
+	int R_temp;
+
+	int sign = 1;
+	int rmin1 = INT32_MAX;
+	int rmin2 = INT32_MAX;
+	char idx_min = 0;
+
+	h_element h_element_t;
+	int s = (BG==1)? h_ele_row_bg1_count[iBlkRow]:h_ele_row_bg2_count[iBlkRow];
+	offsetR = (iMCW * row*col*Zc) + iBlkRow * Zc + iSubRow;	// row*col*Zc = size of dev_dt
+//	if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
+//	The 1st recursion
+	for(int i = 0; i < s; i++) // loop through all the ZxZ sub-blocks in a row
+	{
+		h_element_t = dev_h_compact1[i*row+iBlkRow];	// compact_col == row
+
+		iBlkCol = h_element_t.y;
+		shift_t = h_element_t.value;
+
+		shift_t = (iSubRow + shift_t) % Zc;
+		iCol = (iMCW * col*Zc) + iBlkCol * Zc + shift_t;	// col*Zc = size of llr
+		Q = dev_llr[iCol];
+		Q_abs = (Q>0)? Q : -Q;
+		sq = Q < 0;
+//		if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
+		// quick version
+		sign = sign * (1 - sq * 2);
+		Q_sign |= sq << i;
+
+		if (Q_abs < rmin1){
+			rmin2 = rmin1;
+			rmin1 = Q_abs;
+			idx_min = i;
+		} else if (Q_abs < rmin2){
+			rmin2 = Q_abs;
+		}
+	}
+
+//	if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);
+
+//	The 2nd recursion
+	for(int i = 0; i < s; i++){
+		// v0: Best performance so far. 0.75f is the value of alpha.
+		sq = 1 - 2 * ((Q_sign >> i) & 0x01);
+		R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2);
+		// write results to global memory
+		h_element_t = dev_h_compact1[i*row+iBlkRow];
+		int addr_temp = offsetR + h_element_t.y * row * Zc;
+		dev_dt[addr_temp] = R_temp;
+//		if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
+	}	
+}
+
+// Kernel_1
+__kernel void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int row, int col, int Zc)
+{
+//	if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp\n");
+	int iMCW = blockIdx.y;
+	int iBlkRow = blockIdx.x;	// block row in h_base
+	int iBlkCol; 				// block col in h_base
+	int iSubRow = threadIdx.x;	// row index in sub_block of h_base
+	int iCol; 					// overall col index in h_base
+	int offsetR;
+	int shift_t;
+
+//	For 2-min algorithm.
+	int Q_sign = 0;
+	int sq;
+	int Q, Q_abs;
+	int R_temp;
+
+	int sign = 1;
+	int rmin1 = INT32_MAX;
+	int rmin2 = INT32_MAX;
+	char idx_min = 0;
+
+	h_element h_element_t;
+	int s = (BG==1)? h_ele_row_bg1_count[iBlkRow]: h_ele_row_bg2_count[iBlkRow];
+	offsetR = (iMCW *row*col*Zc) + iBlkRow * Zc + iSubRow;	// row * col * Zc = size of dev_dt
+//	if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
+//	The 1st recursion
+	for(int i = 0; i < s; i++) // loop through all the ZxZ sub-blocks in a row
+	{
+		h_element_t = dev_h_compact1[i*row+iBlkRow];
+
+		iBlkCol = h_element_t.y;
+		shift_t = h_element_t.value;
+		shift_t = (iSubRow + shift_t) % Zc;
+		iCol = iBlkCol * Zc + shift_t;
+		
+		R_temp = dev_dt[offsetR + iBlkCol * row * Zc];
+
+		Q = dev_llr[iMCW * (col*Zc) + iCol] - R_temp;
+		Q_abs = (Q>0)? Q : -Q;
+//		if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
+		sq = Q < 0;
+		sign = sign * (1 - sq * 2);
+		Q_sign |= sq << i;
+
+		if (Q_abs < rmin1){
+			rmin2 = rmin1;
+			rmin1 = Q_abs;
+			idx_min = i;
+		} else if (Q_abs < rmin2){
+			rmin2 = Q_abs;
+		}
+		
+	}
+
+//	if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);
+	
+//	The 2nd recursion
+	for(int i = 0; i < s; i ++){
+		sq = 1 - 2 * ((Q_sign >> i) & 0x01);
+		R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2);
+		
+
+		// write results to global memory
+		h_element_t = dev_h_compact1[i*row+iBlkRow];
+		int addr_temp = h_element_t.y * row * Zc + offsetR;
+		dev_dt[addr_temp] = R_temp;
+//		if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
+	}
+}
+
+// Kernel 2: VNP processing
+__kernel void
+ldpc_vnp_kernel_normal(/*char * dev_llr, char * dev_dt, char * dev_const_llr,*/ int BG, int row, int col, int Zc)
+{	
+	int iMCW = blockIdx.y;
+	int iBlkCol = blockIdx.x;
+	int iBlkRow;
+	int iSubCol = threadIdx.x;
+	int iRow;
+	int iCol;
+
+	int shift_t, sf;
+	int APP;
+
+	h_element h_element_t;
+
+	// update all the llr values
+	iCol = iBlkCol * Zc + iSubCol;
+	APP = dev_const_llr[iMCW *col*Zc + iCol];
+	int offsetDt = iMCW *row*col*Zc + iBlkCol * row * Zc;
+	int s = (BG==1)? h_ele_col_bg1_count[iBlkCol]:h_ele_col_bg2_count[iBlkCol];
+	
+	for(int i = 0; i < s; i++)
+	{
+		h_element_t = dev_h_compact2[i*col+iBlkCol];
+
+		shift_t = h_element_t.value%Zc;
+		iBlkRow = h_element_t.x;
+
+		sf = iSubCol - shift_t;
+		sf = (sf + Zc) % Zc;
+
+		iRow = iBlkRow * Zc + sf;
+		APP = APP + dev_dt[offsetDt + iRow];
+	}
+	if(APP > SCHAR_MAX)	APP = SCHAR_MAX;
+	if(APP < SCHAR_MIN)	APP = SCHAR_MIN;
+	// write back to device global memory
+	dev_llr[iMCW *col*Zc + iCol] = APP;
+}
+
+
+__kernel void pack_decoded_bit(/*char *dev, unsigned char *host,*/ int col, int Zc)
+{
+	__shared__ unsigned char tmp[128];
+	int iMCW = blockIdx.y;
+	int tid = iMCW * col*Zc + blockIdx.x*128 + threadIdx.x;
+	int btid = threadIdx.x;
+	tmp[btid] = 0;
+	
+	if(dev_llr[tid] < 0){
+		tmp[btid] = 1 << (7-(btid&7));
+	}
+	__syncthreads();
+	
+	if(threadIdx.x < 16){
+		dev_tmp[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] = 0;
+		for(int i = 0; i < 8; i++){
+			dev_tmp[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] += tmp[threadIdx.x*8+i];
+		}
+	}
+}
+