Commit f178a5f0 authored by frtabu's avatar frtabu

opencl implementation of ldpc decoder (platform query)

parent 60a91939
......@@ -40,7 +40,7 @@
#define MC 1
#define MAX_OCLDEV 10
#define MAX_OCLRUNTIME 5
typedef struct{
char x;
char y;
......@@ -51,14 +51,20 @@ typedef struct{
cl_uint max_CU;
cl_uint max_WID;
size_t *max_WIS;
} ocldev_t;
typedef struct{
cl_uint num_devices;
cl_device_id devices[MAX_OCLDEV];
ocldev_t ocldev[MAX_OCLDEV];
cl_context context;
cl_program program;
cl_kernel kernel;
cl_command_queue queue;
} ocldev_t;
cl_command_queue queue[MAX_OCLDEV];
} oclruntime_t;
typedef struct{
ocldev_t ocldev[MAX_OCLDEV];
oclruntime_t runtime[MAX_OCLRUNTIME];
} ocl_t;
......@@ -86,7 +92,8 @@ void init_LLR_DMA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out
// cudaDeviceSynchronize();
}
cl_error_callback(const char* errinfo, const void* private_info, size_t cb, void* user_data) {
void cl_error_callback(const char* errinfo, const void* private_info, size_t cb, void* user_data) {
oclruntime_t *runtime = (oclruntime_t *)user_data;
LOG_E(HW,"OpenCL accelerator error %s\n", errinfo );
}
......@@ -103,11 +110,25 @@ char *clutil_getstrdev(int intdev) {
return retstring;
}
size_t load_source(char **source_str) {
int MAX_SOURCE_SIZE=(500*132);
FILE *fp;
size_t source_size;
fp = fopen("nrLDPC_decoder_kernels_CL.cl", "r");
AssertFatal(fp,"failed to open cl source: %s\n",strerror(errno));
*source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( *source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
return source_size;
}
/* from here: entry points in decoder shared lib */
int ldpc_autoinit(void) { // called by the library loader
cl_platform_id platforms[10];
cl_uint num_platforms_found;
int context_ok=0;
cl_uint rt = clGetPlatformIDs( sizeof(platforms)/sizeof(cl_platform_id), platforms, &num_platforms_found );
AssertFatal(rt == CL_SUCCESS, "clGetPlatformIDs error %d\n" , (int)rt);
AssertFatal( num_platforms_found>0 , "clGetPlatformIDs: no cl compatible platform found\n");
......@@ -119,33 +140,50 @@ int ldpc_autoinit(void) { // called by the library loader
rt = clGetPlatformInfo(platforms[i],CL_PLATFORM_VERSION, sizeof(stringval),stringval,NULL);
AssertFatal(rt == CL_SUCCESS, "clGetPlatformInfo VERSION error %d\n" , (int)rt);
LOG_I(HW,"Platform %i, OpenCL version %s\n", i,stringval );
cl_device_id devices[20];
cl_uint num_devices_found;
rt = clGetDeviceIDs(platforms[i],CL_DEVICE_TYPE_ALL, sizeof(devices)/sizeof(cl_device_id),devices,&num_devices_found);
rt = clGetDeviceIDs(platforms[i],CL_DEVICE_TYPE_ALL, sizeof(ocl.runtime[i].devices)/sizeof(cl_device_id),ocl.runtime[i].devices,&(ocl.runtime[i].num_devices));
AssertFatal(rt == CL_SUCCESS, "clGetDeviceIDs error %d\n" , (int)rt);
for (int j=0; j<num_devices_found; j++) {
int devok=0;
for (int j=0; j<ocl.runtime[i].num_devices; j++) {
cl_bool abool;
rt = clGetDeviceInfo(devices[j],CL_DEVICE_AVAILABLE, sizeof(abool),&abool,NULL);
rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_AVAILABLE, sizeof(abool),&abool,NULL);
AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo DEVICE_AVAILABLE error %d\n" , (int)rt);
LOG_I(HW,"Device %i is %s available\n", j, (abool==CL_TRUE?"":"not"));
cl_device_type devtype;
rt = clGetDeviceInfo(devices[j],CL_DEVICE_TYPE, sizeof(cl_device_type),&devtype,NULL);
rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_TYPE, sizeof(cl_device_type),&devtype,NULL);
AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo DEVICE_TYPE error %d\n" , (int)rt);
LOG_I(HW,"Device %i, type %d = %s\n", j,(int)devtype, clutil_getstrdev(devtype));
rt = clGetDeviceInfo(devices[j],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(ocl.ocldev[j].max_CU),&(ocl.ocldev[j].max_CU),NULL);
rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(ocl.runtime[i].ocldev[j].max_CU),&(ocl.runtime[i].ocldev[j].max_CU),NULL);
AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo MAX_COMPUTE_UNITS error %d\n" , (int)rt);
LOG_I(HW,"Device %i, number of Compute Units: %d\n", j,ocl.ocldev[j].max_CU);
rt = clGetDeviceInfo(devices[j],CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,sizeof(ocl.ocldev[j].max_WID),&(ocl.ocldev[j].max_WID),NULL);
LOG_I(HW,"Device %i, number of Compute Units: %d\n", j,ocl.runtime[i].ocldev[j].max_CU);
rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,sizeof(ocl.runtime[i].ocldev[j].max_WID),&(ocl.runtime[i].ocldev[j].max_WID),NULL);
AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo MAX_WORK_ITEM_DIMENSIONS error %d\n" , (int)rt);
LOG_I(HW,"Device %i, max Work Items dimension: %d\n", j,ocl.ocldev[j].max_WID);
ocl.ocldev[j].max_WIS = (size_t *)malloc(ocl.ocldev[j].max_WID * sizeof(size_t));
rt = clGetDeviceInfo(devices[j],CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(ocl.ocldev[j].max_WID)*sizeof(size_t),ocl.ocldev[j].max_WIS,NULL);
LOG_I(HW,"Device %i, max Work Items dimension: %d\n", j,ocl.runtime[i].ocldev[j].max_WID);
ocl.runtime[i].ocldev[j].max_WIS = (size_t *)malloc(ocl.runtime[i].ocldev[j].max_WID * sizeof(size_t));
rt = clGetDeviceInfo(ocl.runtime[i].devices[j],CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(ocl.runtime[i].ocldev[j].max_WID)*sizeof(size_t),ocl.runtime[i].ocldev[j].max_WIS,NULL);
AssertFatal(rt == CL_SUCCESS, "clGetDeviceInfo MAX_WORK_ITEM_SIZES error %d\n" , (int)rt);
for(int k=0; k<ocl.ocldev[j].max_WID;k++)
LOG_I(HW,"Device %i, max Work Items size for dimension: %d %u\n", j,k,(uint32_t)ocl.ocldev[j].max_WIS[k]);
for(int k=0; k<ocl.runtime[i].ocldev[j].max_WID;k++)
LOG_I(HW,"Device %i, max Work Items size for dimension: %d %u\n", j,k,(uint32_t)ocl.runtime[i].ocldev[j].max_WIS[k]);
devok++;
}
if (devok >0) {
ocl.runtime[i].context = clCreateContext(NULL, ocl.runtime[i].num_devices, ocl.runtime[i].devices, cl_error_callback, &(ocl.runtime[i]), (cl_int *)&rt);
AssertFatal(rt == CL_SUCCESS, "Error %d creating context for platform %i\n" , (int)rt, i);
for(int dev=0; dev<ocl.runtime[i].num_devices; dev++) {
ocl.runtime[i].queue[dev] = clCreateCommandQueueWithProperties(ocl.runtime[i].context,ocl.runtime[i].devices[dev] , 0, (cl_int *)&rt);
AssertFatal(rt == CL_SUCCESS, "Error %d creating command queue for platform %i device %i\n" , (int)rt, i,dev);
}
char *source_str;
size_t source_size=load_source(&source_str);
cl_program program = clCreateProgramWithSource(ocl.runtime[i].context, 1,
(const char **)&source_str, (const size_t *)&source_size, (cl_int *)&rt);
AssertFatal(rt == CL_SUCCESS, "Error %d creating program for platform %i \n" , (int)rt, i);
rt = clBuildProgram(program, ocl.runtime[i].num_devices,ocl.runtime[i].devices, NULL, NULL, NULL);
AssertFatal(rt == CL_SUCCESS, "Error %d buildding program for platform %i \n" , rt, i);
context_ok++;
}
devok=0;
}
AssertFatal(context_ok>0, "No openCL device available to accelerate ldpc\n");
return 0;
}
......
define MAX_ITERATION 2
#define MC 1
typedef struct{
char x;
char y;
short value;
} h_element;
char dev_const_llr[68*384];
char dev_dt [46*68*384];
char dev_llr[68*384];
unsigned char dev_tmp[68*384];
h_element h_compact1 [46*19] = {};
h_element h_compact2 [68*30] = {};
h_element dev_h_compact1[46*19]; // used in kernel 1
h_element dev_h_compact2[68*30]; // used in kernel 2
// __device__ __constant__ h_element dev_h_compact1[46*19]; // used in kernel 1
// __device__ __constant__ h_element dev_h_compact2[68*30]; // used in kernel 2
// row and col element count
const char h_ele_row_bg1_count[46] = {
19, 19, 19, 19, 3, 8, 9, 7, 10, 9,
7, 8, 7, 6, 7, 7, 6, 6, 6, 6,
6, 6, 5, 5, 6, 5, 5, 4, 5, 5,
5, 5, 5, 5, 5, 5, 5, 4, 5, 5,
4, 5, 4, 5, 5, 4};
const char h_ele_col_bg1_count[68] = {
30, 28, 7, 11, 9, 4, 8, 12, 8, 7,
12, 10, 12, 11, 10, 7, 10, 10, 13, 7,
8, 11, 12, 5, 6, 6, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1};
const char h_ele_row_bg2_count[42] = {
8, 10, 8, 10, 4, 6, 6, 6, 4, 5,
5, 5, 4, 5, 5, 4, 5, 5, 4, 4,
4, 4, 3, 4, 4, 3, 5, 3, 4, 3,
5, 3, 4, 4, 4, 4, 4, 3, 4, 4,
4, 4};
const char h_ele_col_bg2_count[52] = {
22, 23, 10, 5, 5, 14, 7, 13, 6, 8,
9, 16, 9, 12, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1};
// Kernel 1
__kernel void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, int col, int Zc)
{
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp %d\n", threadIdx.x);
int iMCW = blockIdx.y; // codeword id
int iBlkRow = blockIdx.x; // block row in h_base
int iBlkCol; // block col in h_base
int iSubRow = threadIdx.x; // row index in sub_block of h_base
int iCol; // overall col index in h_base
int offsetR;
int shift_t;
// For 2-min algorithm.
int Q_sign = 0;
int sq;
int Q, Q_abs;
int R_temp;
int sign = 1;
int rmin1 = INT32_MAX;
int rmin2 = INT32_MAX;
char idx_min = 0;
h_element h_element_t;
int s = (BG==1)? h_ele_row_bg1_count[iBlkRow]:h_ele_row_bg2_count[iBlkRow];
offsetR = (iMCW * row*col*Zc) + iBlkRow * Zc + iSubRow; // row*col*Zc = size of dev_dt
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
// The 1st recursion
for(int i = 0; i < s; i++) // loop through all the ZxZ sub-blocks in a row
{
h_element_t = dev_h_compact1[i*row+iBlkRow]; // compact_col == row
iBlkCol = h_element_t.y;
shift_t = h_element_t.value;
shift_t = (iSubRow + shift_t) % Zc;
iCol = (iMCW * col*Zc) + iBlkCol * Zc + shift_t; // col*Zc = size of llr
Q = dev_llr[iCol];
Q_abs = (Q>0)? Q : -Q;
sq = Q < 0;
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
// quick version
sign = sign * (1 - sq * 2);
Q_sign |= sq << i;
if (Q_abs < rmin1){
rmin2 = rmin1;
rmin1 = Q_abs;
idx_min = i;
} else if (Q_abs < rmin2){
rmin2 = Q_abs;
}
}
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);
// The 2nd recursion
for(int i = 0; i < s; i++){
// v0: Best performance so far. 0.75f is the value of alpha.
sq = 1 - 2 * ((Q_sign >> i) & 0x01);
R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2);
// write results to global memory
h_element_t = dev_h_compact1[i*row+iBlkRow];
int addr_temp = offsetR + h_element_t.y * row * Zc;
dev_dt[addr_temp] = R_temp;
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
}
}
// Kernel_1
__kernel void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int row, int col, int Zc)
{
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp\n");
int iMCW = blockIdx.y;
int iBlkRow = blockIdx.x; // block row in h_base
int iBlkCol; // block col in h_base
int iSubRow = threadIdx.x; // row index in sub_block of h_base
int iCol; // overall col index in h_base
int offsetR;
int shift_t;
// For 2-min algorithm.
int Q_sign = 0;
int sq;
int Q, Q_abs;
int R_temp;
int sign = 1;
int rmin1 = INT32_MAX;
int rmin2 = INT32_MAX;
char idx_min = 0;
h_element h_element_t;
int s = (BG==1)? h_ele_row_bg1_count[iBlkRow]: h_ele_row_bg2_count[iBlkRow];
offsetR = (iMCW *row*col*Zc) + iBlkRow * Zc + iSubRow; // row * col * Zc = size of dev_dt
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("s: %d, offset %d\n", s, offsetR);
// The 1st recursion
for(int i = 0; i < s; i++) // loop through all the ZxZ sub-blocks in a row
{
h_element_t = dev_h_compact1[i*row+iBlkRow];
iBlkCol = h_element_t.y;
shift_t = h_element_t.value;
shift_t = (iSubRow + shift_t) % Zc;
iCol = iBlkCol * Zc + shift_t;
R_temp = dev_dt[offsetR + iBlkCol * row * Zc];
Q = dev_llr[iMCW * (col*Zc) + iCol] - R_temp;
Q_abs = (Q>0)? Q : -Q;
// if(blockIdx.x == 0 && threadIdx.x == 1) printf("i %d, icol %d, Q: %d\n", i, iCol, Q);
sq = Q < 0;
sign = sign * (1 - sq * 2);
Q_sign |= sq << i;
if (Q_abs < rmin1){
rmin2 = rmin1;
rmin1 = Q_abs;
idx_min = i;
} else if (Q_abs < rmin2){
rmin2 = Q_abs;
}
}
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("min1 %d, min2 %d, min1_idx %d\n", rmin1, rmin2, idx_min);
// The 2nd recursion
for(int i = 0; i < s; i ++){
sq = 1 - 2 * ((Q_sign >> i) & 0x01);
R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2);
// write results to global memory
h_element_t = dev_h_compact1[i*row+iBlkRow];
int addr_temp = h_element_t.y * row * Zc + offsetR;
dev_dt[addr_temp] = R_temp;
// if(blockIdx.x == 0 && threadIdx.x == 1)printf("R_temp %d, temp_addr %d\n", R_temp, addr_temp);
}
}
// Kernel 2: VNP processing
__kernel void
ldpc_vnp_kernel_normal(/*char * dev_llr, char * dev_dt, char * dev_const_llr,*/ int BG, int row, int col, int Zc)
{
int iMCW = blockIdx.y;
int iBlkCol = blockIdx.x;
int iBlkRow;
int iSubCol = threadIdx.x;
int iRow;
int iCol;
int shift_t, sf;
int APP;
h_element h_element_t;
// update all the llr values
iCol = iBlkCol * Zc + iSubCol;
APP = dev_const_llr[iMCW *col*Zc + iCol];
int offsetDt = iMCW *row*col*Zc + iBlkCol * row * Zc;
int s = (BG==1)? h_ele_col_bg1_count[iBlkCol]:h_ele_col_bg2_count[iBlkCol];
for(int i = 0; i < s; i++)
{
h_element_t = dev_h_compact2[i*col+iBlkCol];
shift_t = h_element_t.value%Zc;
iBlkRow = h_element_t.x;
sf = iSubCol - shift_t;
sf = (sf + Zc) % Zc;
iRow = iBlkRow * Zc + sf;
APP = APP + dev_dt[offsetDt + iRow];
}
if(APP > SCHAR_MAX) APP = SCHAR_MAX;
if(APP < SCHAR_MIN) APP = SCHAR_MIN;
// write back to device global memory
dev_llr[iMCW *col*Zc + iCol] = APP;
}
__kernel void pack_decoded_bit(/*char *dev, unsigned char *host,*/ int col, int Zc)
{
__shared__ unsigned char tmp[128];
int iMCW = blockIdx.y;
int tid = iMCW * col*Zc + blockIdx.x*128 + threadIdx.x;
int btid = threadIdx.x;
tmp[btid] = 0;
if(dev_llr[tid] < 0){
tmp[btid] = 1 << (7-(btid&7));
}
__syncthreads();
if(threadIdx.x < 16){
dev_tmp[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] = 0;
for(int i = 0; i < 8; i++){
dev_tmp[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] += tmp[threadIdx.x*8+i];
}
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment