Fix CUDA latency

96023eeb · NCTU OpinConnect Terng-Yin Hsu/WEI-YING,LIN · 61bdea75 · f7dc17ec · 96023eeb · 96023eeb
Commit 96023eeb authored Aug 06, 2020 by NCTU OpinConnect Terng-Yin Hsu/WEI-YING,LIN
3 changed files
--- a/openair1/PHY/CODING/TESTBENCH/ldpctest.c
+++ b/openair1/PHY/CODING/TESTBENCH/ldpctest.c
@@ -402,9 +402,17 @@ int test_ldpc(short No_iteration,
    	  start_meas(time_decoder);
 #ifdef CUDA_FLAG
        if(run_cuda){
+<<<<<<< HEAD
          n_iter = nrLDPC_decoder_LYC(&decParams, (int8_t*)channel_output_fixed[j], (int8_t*)estimated_output[j], block_length, time_decoder);
        }  
        else{ 
+=======
+          printf("***********run ldpc by cuda\n");
+          n_iter = nrLDPC_decoder_LYC(&decParams, (int8_t*)channel_output_fixed[j], (int8_t*)estimated_output[j], block_length, time_decoder);
+        }  
+        else{ 
+          printf("**************run ldpc by cpu\n");  
+>>>>>>> origin/develop
        // decode the sequence
        // decoder supports BG2, Z=128 & 256
        //esimated_output=ldpc_decoder(channel_output_fixed, block_length, No_iteration, (double)((float)nom_rate/(float)denom_rate));

--- a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
+++ b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
@@ -32,7 +32,11 @@
 #include "bgs/BG2_I6"
 #include "bgs/BG2_I7"

+<<<<<<< HEAD
 #define MAX_ITERATION 2
+=======
+#define MAX_ITERATION 5
+>>>>>>> origin/develop
 #define MC	1

 #define cudaCheck(ans) { cudaAssert((ans), __FILE__, __LINE__); }
@@ -49,21 +53,30 @@ typedef struct{
  char y;
  short value;
 } h_element;
+<<<<<<< HEAD
 #include "bgs/BG1_compact_in_C.h"

 __device__ char dev_const_llr[68*384];
 __device__ char dev_dt [46*68*384];
 __device__ char dev_llr[68*384];
 __device__ unsigned char dev_tmp[68*384];
+=======
+
+>>>>>>> origin/develop

 h_element h_compact1 [46*19] = {};
 h_element h_compact2 [68*30] = {};

+<<<<<<< HEAD
 __device__  h_element dev_h_compact1[46*19];  // used in kernel 1
 __device__  h_element dev_h_compact2[68*30];  // used in kernel 2

 // __device__ __constant__ h_element dev_h_compact1[46*19];  // used in kernel 1
 // __device__ __constant__ h_element dev_h_compact2[68*30];  // used in kernel 2
+=======
+__device__ __constant__ h_element dev_h_compact1[46*19];  // used in kernel 1
+__device__ __constant__ h_element dev_h_compact2[68*30];  // used in kernel 2
+>>>>>>> origin/develop

 // row and col element count
 __device__ __constant__ char h_ele_row_bg1_count[46] = {
@@ -100,6 +113,7 @@ __global__ void warmup()
 	// warm up gpu for time measurement
 }

+<<<<<<< HEAD
 extern "C"
 void warmup_for_GPU(){
 	
@@ -187,6 +201,11 @@ void set_compact_BG(int Zc,short BG){

 // Kernel 1
 __global__ void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, int col, int Zc)
+=======
+
+// Kernel 1
+__global__ void ldpc_cnp_kernel_1st_iter(char * dev_llr, char * dev_dt, int BG, int row, int col, int Zc)
+>>>>>>> origin/develop
 {
 //	if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp %d\n", threadIdx.x);
 	int iMCW = blockIdx.y;		// codeword id
@@ -245,7 +264,11 @@ __global__ void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, in
 	for(int i = 0; i < s; i++){
 		// v0: Best performance so far. 0.75f is the value of alpha.
 		sq = 1 - 2 * ((Q_sign >> i) & 0x01);
+<<<<<<< HEAD
 		R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2);
+=======
+		R_temp = 0.8 * sign * sq * (i != idx_min ? rmin1 : rmin2);
+>>>>>>> origin/develop
 		// write results to global memory
 		h_element_t = dev_h_compact1[i*row+iBlkRow];
 		int addr_temp = offsetR + h_element_t.y * row * Zc;
@@ -255,7 +278,11 @@ __global__ void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, in
 }

 // Kernel_1
+<<<<<<< HEAD
 __global__ void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int row, int col, int Zc)
+=======
+__global__ void ldpc_cnp_kernel(char * dev_llr, char * dev_dt, int BG, int row, int col, int Zc)
+>>>>>>> origin/develop
 {
 //	if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp\n");
 	int iMCW = blockIdx.y;
@@ -315,7 +342,11 @@ __global__ void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int r
 //	The 2nd recursion
 	for(int i = 0; i < s; i ++){
 		sq = 1 - 2 * ((Q_sign >> i) & 0x01);
+<<<<<<< HEAD
 		R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2);
+=======
+		R_temp = 0.8 * sign * sq * (i != idx_min ? rmin1 : rmin2);
+>>>>>>> origin/develop
 		

 		// write results to global memory
@@ -328,7 +359,11 @@ __global__ void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int r

 // Kernel 2: VNP processing
 __global__ void
+<<<<<<< HEAD
 ldpc_vnp_kernel_normal(/*char * dev_llr, char * dev_dt, char * dev_const_llr,*/ int BG, int row, int col, int Zc)
+=======
+ldpc_vnp_kernel_normal(char * dev_llr, char * dev_dt, char * dev_const_llr, int BG, int row, int col, int Zc)
+>>>>>>> origin/develop
 {	
 	int iMCW = blockIdx.y;
 	int iBlkCol = blockIdx.x;
@@ -368,7 +403,11 @@ ldpc_vnp_kernel_normal(/*char * dev_llr, char * dev_dt, char * dev_const_llr,*/
 }


+<<<<<<< HEAD
 __global__ void pack_decoded_bit(/*char *dev, unsigned char *host,*/ int col, int Zc)
+=======
+__global__ void pack_decoded_bit(char *dev, unsigned char *host, int col, int Zc)
+>>>>>>> origin/develop
 {
 	__shared__ unsigned char tmp[128];
 	int iMCW = blockIdx.y;
@@ -376,15 +415,25 @@ __global__ void pack_decoded_bit(/*char *dev, unsigned char *host,*/ int col, in
 	int btid = threadIdx.x;
 	tmp[btid] = 0;
 	
+<<<<<<< HEAD
 	if(dev_llr[tid] < 0){
+=======
+	if(dev[tid] < 0){
+>>>>>>> origin/develop
 		tmp[btid] = 1 << (7-(btid&7));
 	}
 	__syncthreads();
 	
 	if(threadIdx.x < 16){
+<<<<<<< HEAD
 		dev_tmp[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] = 0;
 		for(int i = 0; i < 8; i++){
 			dev_tmp[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] += tmp[threadIdx.x*8+i];
+=======
+		host[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] = 0;
+		for(int i = 0; i < 8; i++){
+			host[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] += tmp[threadIdx.x*8+i];
+>>>>>>> origin/develop
 		}
 	}
 }
@@ -461,6 +510,7 @@ void read_BG(int BG, int *h, int row, int col)
 	*/
 }

+<<<<<<< HEAD
 extern "C"
 void init_LLR_DMA_for_CUDA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length){
 	
@@ -482,17 +532,28 @@ void init_LLR_DMA_for_CUDA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8
 	cudaDeviceSynchronize();
 	
 }
+=======
+>>>>>>> origin/develop

 extern "C"
 int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length, time_stats_t *time_decoder)
 {
+<<<<<<< HEAD

+=======
+	// alloc mem
+	//unsigned char *decision = (unsigned char*)p_out;
+>>>>>>> origin/develop

    uint16_t Zc          = p_decParams->Z;
    uint8_t  BG         = p_decParams->BG;
    uint8_t  numMaxIter = p_decParams->numMaxIter;
    e_nrLDPC_outMode outMode = p_decParams->outMode;
+<<<<<<< HEAD
 	cudaError_t cudaStatus;
+=======
+	
+>>>>>>> origin/develop
 	uint8_t row,col;
 	if(BG == 1){
 		row = 46;
@@ -502,6 +563,7 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8
 		row = 42;
 		col = 52;
 	}
+<<<<<<< HEAD

 //	alloc memory
 	unsigned char *hard_decision = (unsigned char*)p_out;
@@ -510,6 +572,98 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8
 	cudaCheck( cudaMemcpyToSymbol(dev_const_llr, p_llr, memorySize_llr_cuda) );
 	cudaCheck( cudaMemcpyToSymbol(dev_llr, p_llr, memorySize_llr_cuda) );
 	
+=======
+	int compact_row = 30, compact_col = 19, lift_index=0;;
+	if(BG==2){compact_row = 10, compact_col = 23;}
+	
+	short lift_set[][9] = {
+		{2,4,8,16,32,64,128,256},
+		{3,6,12,24,48,96,192,384},
+		{5,10,20,40,80,160,320},
+		{7,14,28,56,112,224},
+		{9,18,36,72,144,288},
+		{11,22,44,88,176,352},
+		{13,26,52,104,208},
+		{15,30,60,120,240},
+		{0}
+	};
+	
+	for(int i = 0; lift_set[i][0] != 0; i++){
+		for(int j = 0; lift_set[i][j] != 0; j++){
+			if(Zc == lift_set[i][j]){
+				lift_index = i;
+				break;
+			}
+		}
+	}
+	
+	int *h = NULL;
+	switch(lift_index){
+		case 0:
+			h = (BG == 1)? h_base_0:h_base_8;
+			break;
+		case 1:
+			h = (BG == 1)? h_base_1:h_base_9;
+			break;
+		case 2:
+			h = (BG == 1)? h_base_2:h_base_10;
+			break;
+		case 3:
+			h = (BG == 1)? h_base_3:h_base_11;
+			break;
+		case 4:
+			h = (BG == 1)? h_base_4:h_base_12;
+			break;
+		case 5:
+			h = (BG == 1)? h_base_5:h_base_13;
+			break;
+		case 6:
+			h = (BG == 1)? h_base_6:h_base_14;
+			break;
+		case 7:
+			h = (BG == 1)? h_base_7:h_base_15;
+			break;
+	}
+	/* pack BG in compact graph */
+	read_BG(BG, h, row, col);
+	
+	
+	int memorySize_h_compact1 = row * compact_col * sizeof(h_element);
+	int memorySize_h_compact2 = compact_row * col * sizeof(h_element);
+
+//	cpu
+	int memorySize_hard_decision = col * Zc * sizeof(unsigned char) * MC;
+	
+	
+//	alloc memory
+	unsigned char *hard_decision = (unsigned char*)p_out;
+	
+//	gpu
+	int memorySize_llr_cuda = col * Zc * sizeof(char) * MC;
+	int memorySize_dt_cuda = row * Zc * col * sizeof(char) * MC;
+
+
+//	alloc memory
+	char *dev_llr;
+	char *dev_dt;
+	char *dev_const_llr;
+	unsigned char *dev_tmp;
+	
+	cudaCheck( cudaMalloc((void **)&dev_tmp, memorySize_hard_decision) );
+	cudaCheck( cudaMalloc((void **)&dev_llr, memorySize_llr_cuda) );
+	cudaCheck( cudaMalloc((void **)&dev_const_llr, memorySize_llr_cuda) );
+	cudaCheck( cudaMalloc((void **)&dev_dt, memorySize_dt_cuda) );
+	
+//	memcpy host to device	
+
+	cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, h_compact1, memorySize_h_compact1) );
+	cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, h_compact2, memorySize_h_compact2) );
+	cudaCheck( cudaMemcpy((void*)dev_const_llr, p_llr, memorySize_llr_cuda, cudaMemcpyHostToDevice) );
+start_meas(time_decoder);
+	cudaCheck( cudaMemcpy((void*)dev_llr, p_llr, memorySize_llr_cuda, cudaMemcpyHostToDevice) );
+	
+
+>>>>>>> origin/develop
 // Define CUDA kernel dimension
 	int blockSizeX = Zc;
 	dim3 dimGridKernel1(row, MC, 1); 	// dim of the thread blocks
@@ -518,14 +672,33 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8
    dim3 dimGridKernel2(col, MC, 1);
    dim3 dimBlockKernel2(blockSizeX, 1, 1);	
 	cudaDeviceSynchronize();
+<<<<<<< HEAD

 // lauch kernel 

+=======
+	
+	cudaEvent_t start, end;
+	float time;
+
+	warmup<<<dimGridKernel1, dimBlockKernel1>>>();
+	warmup<<<dimGridKernel2, dimBlockKernel2>>>();
+
+
+	cudaEventCreate(&start);
+	cudaEventCreate(&end);
+	cudaEventRecord(start, 0);
+	
+//	cudaProfilerStart();
+	
+// lauch kernel 
+>>>>>>> origin/develop
 	for(int ii = 0; ii < MAX_ITERATION; ii++){
 		// first kernel	
 		if(ii == 0){
 			ldpc_cnp_kernel_1st_iter 
 			<<<dimGridKernel1, dimBlockKernel1>>>
+<<<<<<< HEAD
 			(/*dev_llr,*/ BG, row, col, Zc);
 		}else{
 			ldpc_cnp_kernel
@@ -547,6 +720,43 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8
 	cudaDeviceSynchronize();
 	

+=======
+			(dev_llr, dev_dt, BG, row, col, Zc);
+		}else{
+			ldpc_cnp_kernel
+			<<<dimGridKernel1, dimBlockKernel1>>>
+			(dev_llr, dev_dt, BG, row, col, Zc);
+		}
+		
+		// second kernel
+		
+			ldpc_vnp_kernel_normal
+			<<<dimGridKernel2, dimBlockKernel2>>>
+			(dev_llr, dev_dt, dev_const_llr, BG, row, col, Zc);
+		
+	}
+	int pack = (block_length/128)+1;
+	dim3 pack_block(pack, MC, 1);
+	pack_decoded_bit<<<pack_block,128>>>(dev_llr, dev_tmp, col, Zc);
+
+
+	cudaEventRecord(end, 0);
+	cudaEventSynchronize(end);
+	cudaEventElapsedTime(&time, start, end);
+	
+
+	//cudaCheck( cudaMemcpy((*)hard_decision, (const void*)dev_tmp, memorySize_hard_decision, cudaMemcpyDeviceToHost) );
+	cudaCheck( cudaMemcpy((void*)hard_decision, (const void*)dev_tmp, (block_length/8)*sizeof(unsigned char), cudaMemcpyDeviceToHost) );
+	cudaDeviceSynchronize();
+stop_meas(time_decoder);
+	
+	cudaCheck( cudaFree(dev_llr) );
+	cudaCheck( cudaFree(dev_dt) );
+	cudaCheck( cudaFree(dev_const_llr) );
+	cudaCheck( cudaFree(dev_tmp) );
+	
+	//free(hard_decision);
+>>>>>>> origin/develop
 	return MAX_ITERATION;
 	
 }
--- a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.h
+++ b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.h
@@ -23,10 +23,13 @@

 int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length, time_stats_t *time_decoder);

+<<<<<<< HEAD
 void init_LLR_DMA_for_CUDA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length);

 void warmup_for_GPU(void);

 void set_compact_BG(int Zc, short BG);

+=======
+>>>>>>> origin/develop
 #endif