Commit 1cd4c870 authored by frtabu's avatar frtabu

Add -v option in ldpctest to select ldpc shared lib version

going on with openCL ldpc implem
parent 3586e83f
......@@ -503,11 +503,10 @@ int main(int argc, char *argv[])
unsigned int errors, errors_bit, crc_misses;
double errors_bit_uncoded;
short block_length=8448; // decoder supports length: 1201 -> 1280, 2401 -> 2560
char *ldpc_version=NULL; /* version of the ldpc decoder library to use (XXX suffix to use when loading libldpc_XXX.so */
short No_iteration=5;
int n_segments=1;
//double rate=0.333;
short run_cuda = 0;
int nom_rate=1;
int denom_rate=3;
......@@ -527,7 +526,7 @@ int main(int argc, char *argv[])
short BG=0,Zc,Kb=0;
while ((c = getopt (argc, argv, "q:r:s:S:l:G:n:d:i:t:u:h")) != -1)
while ((c = getopt (argc, argv, "q:r:s:S:l:G:n:d:i:t:u:hv:")) != -1)
switch (c)
{
case 'q':
......@@ -547,7 +546,7 @@ int main(int argc, char *argv[])
break;
case 'G':
run_cuda = atoi(optarg);
ldpc_version="_cuda";
break;
case 'n':
......@@ -573,7 +572,9 @@ int main(int argc, char *argv[])
case 'u':
test_uncoded = atoi(optarg);
break;
case 'v':
ldpc_version=strdup(optarg);
break;
case 'h':
default:
printf("CURRENTLY SUPPORTED CODE RATES: \n");
......@@ -592,6 +593,7 @@ int main(int argc, char *argv[])
printf("-t SNR simulation step, Default: 0.1\n");
printf("-i Max decoder iterations, Default: 5\n");
printf("-u Set SNR per coded bit, Default: 0\n");
printf("-v XXX Set ldpc shared library version. libldpc_XXX.so will be used \n");
exit(1);
break;
}
......@@ -601,8 +603,9 @@ int main(int argc, char *argv[])
printf("n_trials %d: \n", n_trials);
printf("SNR0 %f: \n", SNR0);
if(run_cuda)
load_nrLDPClib("_cuda");
if (ldpc_version != NULL)
load_nrLDPClib(ldpc_version);
else
load_nrLDPClib(NULL);
load_nrLDPClib_ref("_orig", &encoder_orig);
......
......@@ -53,13 +53,24 @@ typedef struct{
size_t *max_WIS;
} ocldev_t;
typedef struct {
cl_kernel cnp_kernel_1st;
cl_kernel cnp_kernel;
cl_kernel vnp_kernel_normal;
cl_kernel pack_decoded_bit;
} oclkernels_t;
typedef struct{
cl_uint num_devices;
cl_device_id devices[MAX_OCLDEV];
ocldev_t ocldev[MAX_OCLDEV];
cl_context context;
cl_program program;
cl_kernel kernel;
cl_mem dev_const_llr;
cl_mem dev_llr;
cl_mem dev_dt;
cl_mem dev_tmp;
oclkernels_t kernels[MAX_OCLDEV];
cl_command_queue queue[MAX_OCLDEV];
} oclruntime_t;
......@@ -189,6 +200,14 @@ int ldpc_autoinit(void) { // called by the library loader
ocl.runtime[i].queue[dev] = clCreateCommandQueueWithProperties(ocl.runtime[i].context,ocl.runtime[i].devices[dev] , 0, (cl_int *)&rt);
AssertFatal(rt == CL_SUCCESS, "Error %d creating command queue for platform %i device %i\n" , (int)rt, i,dev);
}
ocl.runtime[i].dev_const_llr = clCreateBuffer(ocl.runtime[i].context, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY, 68*384, NULL, (cl_int *)&rt);
AssertFatal(rt == CL_SUCCESS, "Error %d creating buffer dev_const_llr for platform %i \n" , (int)rt, i);
ocl.runtime[i].dev_llr = clCreateBuffer(ocl.runtime[i].context, CL_MEM_READ_WRITE|CL_MEM_HOST_WRITE_ONLY, 68*384, NULL, (cl_int *)&rt);
AssertFatal(rt == CL_SUCCESS, "Error %d creating buffer dev_llr for platform %i \n" , (int)rt, i);
ocl.runtime[i].dev_dt = clCreateBuffer(ocl.runtime[i].context, CL_MEM_READ_WRITE|CL_MEM_HOST_NO_ACCESS, 46*68*384, NULL, (cl_int *)&rt);
AssertFatal(rt == CL_SUCCESS, "Error %d creating buffer dev_dt for platform %i \n" , (int)rt, i);
ocl.runtime[i].dev_tmp = clCreateBuffer(ocl.runtime[i].context, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY, 68*384, NULL, (cl_int *)&rt);
AssertFatal(rt == CL_SUCCESS, "Error %d creating buffer dev_tmp for platform %i \n" , (int)rt, i);
char *source_str;
size_t source_size=load_source(&source_str);
cl_program program = clCreateProgramWithSource(ocl.runtime[i].context, 1,
......@@ -199,6 +218,17 @@ int ldpc_autoinit(void) { // called by the library loader
get_CompilErr(program,i);
}
AssertFatal(rt == CL_SUCCESS, "Error %d buildding program for platform %i \n" , rt, i);
for(int dev=0; dev<ocl.runtime[i].num_devices; dev++) {
ocl.runtime[i].kernels[dev].cnp_kernel_1st = clCreateKernel(program, "ldpc_cnp_kernel_1st_iter", (cl_int *)&rt);
AssertFatal(rt == CL_SUCCESS, "Error %d creating kernel %s platform %i, dev %i\n" , (int)rt,"ldpc_cnp_kernel_1st_iter", i,dev);
ocl.runtime[i].kernels[dev].cnp_kernel = clCreateKernel(program, "ldpc_cnp_kernel", (cl_int *)&rt);
AssertFatal(rt == CL_SUCCESS, "Error %d creating kernel %s platform %i, dev %i\n" , (int)rt,"ldpc_cnp_kernel", i,dev);
ocl.runtime[i].kernels[dev].vnp_kernel_normal = clCreateKernel(program, "ldpc_vnp_kernel_normal", (cl_int *)&rt);
AssertFatal(rt == CL_SUCCESS, "Error %d creating kernel %s platform %i, dev %i\n" , (int)rt,"ldpc_vnp_kernel_normal", i,dev);
ocl.runtime[i].kernels[dev].pack_decoded_bit = clCreateKernel(program, "pack_decoded_bit", (cl_int *)&rt);
AssertFatal(rt == CL_SUCCESS, "Error %d creating kernel %s platform %i, dev %i\n" , (int)rt,"pack_decoded_bit", i,dev);
}
context_ok++;
}
devok=0;
......@@ -234,10 +264,15 @@ int32_t nrLDPC_decod(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_
// alloc memory
unsigned char *hard_decision = (unsigned char*)p_out;
// gpu
int memorySize_llr_cuda = col * Zc * sizeof(char) * MC;
int memorySize_llr = col * Zc * sizeof(char) * MC;
// cudaCheck( cudaMemcpyToSymbol(dev_const_llr, p_llr, memorySize_llr_cuda) );
// cudaCheck( cudaMemcpyToSymbol(dev_llr, p_llr, memorySize_llr_cuda) );
int rt = clEnqueueWriteBuffer(ocl.runtime[0].queue[0], ocl.runtime[0].dev_const_llr, CL_TRUE, 0,
memorySize_llr, p_llr, 0, NULL, NULL);
AssertFatal(rt == CL_SUCCESS, "Error %d moving p_llr data to read only memory in pltf %i dev %i\n" , (int)rt, 0,0);
rt = clEnqueueWriteBuffer(ocl.runtime[0].queue[0], ocl.runtime[0].dev_llr, CL_TRUE, 0,
memorySize_llr, p_llr, 0, NULL, NULL);
AssertFatal(rt == CL_SUCCESS, "Error %d moving p_llr data to read-write memory in pltf %i dev %i\n" , (int)rt, 0,0);
// Define CUDA kernel dimension
int blockSizeX = Zc;
// dim3 dimGridKernel1(row, MC, 1); // dim of the thread blocks
......@@ -266,15 +301,26 @@ int32_t nrLDPC_decod(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_
// (dev_llr, dev_const_llr,BG, row, col, Zc);
(BG, row, col, Zc);
}
int pack = (block_length/128)+1;
dim3 pack_block(pack, MC, 1);
pack_decoded_bit<<<pack_block,128>>>( col, Zc);
cudaCheck( cudaMemcpyFromSymbol((void*)hard_decision, (const void*)dev_tmp, (block_length/8)*sizeof(unsigned char)) );
cudaDeviceSynchronize();
*/
// int pack = (block_length/128)+1;
// dim3 pack_block(pack, MC, 1);
// pack_decoded_bit<<<pack_block,128>>>( col, Zc);
rt = clSetKernelArg(ocl.runtime[0].kernels[0].pack_decoded_bit, 0, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_llr));
rt = clSetKernelArg(ocl.runtime[0].kernels[0].pack_decoded_bit, 1, sizeof(cl_mem), (void *)&(ocl.runtime[0].dev_tmp));
rt = clSetKernelArg(ocl.runtime[0].kernels[0].pack_decoded_bit, 2, sizeof(int), (void *)&(col));
rt = clSetKernelArg(ocl.runtime[0].kernels[0].pack_decoded_bit, 3, sizeof(int), (void *)&(Zc));
// Execute the OpenCL kernel on the list
size_t global_item_size = block_length; // Process the entire lists
size_t local_item_size = 128; // Divide work items into groups of 128
rt = clEnqueueNDRangeKernel(ocl.runtime[0].queue[0], ocl.runtime[0].kernels[0].pack_decoded_bit, 2, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// cudaCheck( cudaMemcpyFromSymbol((void*)hard_decision, (const void*)dev_tmp, (block_length/8)*sizeof(unsigned char)) );
// cudaDeviceSynchronize();
rt = clEnqueueReadBuffer(ocl.runtime[0].queue[0], ocl.runtime[0].dev_tmp, CL_TRUE, 0,
(block_length/8)*sizeof(unsigned char) , p_llr, 0, NULL, NULL);
AssertFatal(rt == CL_SUCCESS, "Error %d moving p_llr data to pltf %i dev %i\n" , (int)rt, 0,0);
return MAX_ITERATION;
}
......@@ -10,7 +10,6 @@ typedef struct{
short value;
} h_element;
__constant char dev_const_llr[68*384]= {};
//__global char dev_dt [46*68*384];
//__local char *dev_t;
//__global char dev_llr[68*384];
......@@ -239,9 +238,9 @@ ldpc_vnp_kernel_normal(__local char * dev_llr, __local char * dev_dt, /* char *
}
__kernel void pack_decoded_bit(__local unsigned char * dev_llr, __local unsigned char * dev_tmp, int col, int Zc)
__kernel void pack_decoded_bit(__global unsigned char * dev_llr, __global unsigned char * dev_tmp, int col, int Zc)
{
__local unsigned char tmp[128];
unsigned char tmp[128];
int iMCW = get_group_id(1);
int btid = get_local_id(0);
int tid = iMCW * col*Zc + get_group_id(0)*128 + btid;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment