fix compile issue

bc1c69fe · NCTU OpinConnect Terng-Yin Hsu/WEI-YING,LIN · 1fa6e0c6 · bc1c69fe · 1fa6e0c6
Commit bc1c69fe authored Aug 13, 2020 by NCTU OpinConnect Terng-Yin Hsu/WEI-YING,LIN
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 474 deletions

cmake_targets/build_oai cmake_targets/build_oai +2 -2

patch.diff patch.diff +0 -472

No files found.
--- a/cmake_targets/build_oai
+++ b/cmake_targets/build_oai
@@ -711,8 +711,8 @@ function main() {
  if [ "$SIMUS_PHY" = "1" ] ; then
    echo_info "Compiling physical unitary tests simulators"
    # TODO: fix: dlsim_tm4 pucchsim prachsim pdcchsim pbchsim mbmssim
-    # simlist="dlsim ulsim ldpctest polartest smallblocktest nr_pbchsim nr_dlschsim nr_ulschsim nr_dlsim nr_ulsim nr_pucchsim nr_prachsim"
+    simlist="dlsim ulsim ldpctest polartest smallblocktest nr_pbchsim nr_dlschsim nr_ulschsim nr_dlsim nr_ulsim nr_pucchsim nr_prachsim"
-	simlist="ldpctest"
+	# simlist="ldpctest"
    for f in $simlist ; do
      compilations \
        phy_simulators $f \

--- a/patch.diff
+++ b/patch.diff
-diff --git a/cmake_targets/build_oai b/cmake_targets/build_oai
-index 88b8811..a7adc33 100755
--- a/cmake_targets/build_oai
-+++ b/cmake_targets/build_oai
-@@ -701,8 +701,8 @@ function main() {
-   if [ "$SIMUS_PHY" = "1" ] ; then
-     echo_info "Compiling physical unitary tests simulators"
-     # TODO: fix: dlsim_tm4 pucchsim prachsim pdcchsim pbchsim mbmssim
-    simlist="dlsim ulsim ldpctest polartest smallblocktest nr_pbchsim nr_dlschsim nr_ulschsim nr_dlsim nr_ulsim nr_pucchsim nr_prachsim"
-	# simlist="ldpctest"
-+    # simlist="dlsim ulsim ldpctest polartest smallblocktest nr_pbchsim nr_dlschsim nr_ulschsim nr_dlsim nr_ulsim nr_pucchsim nr_prachsim"
-+	simlist="ldpctest"
-     for f in $simlist ; do
-       compilations \
-         phy_simulators $f \
-diff --git a/openair1/PHY/CODING/TESTBENCH/ldpctest.c b/openair1/PHY/CODING/TESTBENCH/ldpctest.c
-index 345122c..72d85b1 100644
--- a/openair1/PHY/CODING/TESTBENCH/ldpctest.c
-+++ b/openair1/PHY/CODING/TESTBENCH/ldpctest.c
-@@ -396,17 +396,15 @@ int test_ldpc(short No_iteration,
-       decParams.numMaxIter=No_iteration;
-       decParams.outMode = nrLDPC_outMode_BIT;
-       //decParams.outMode =nrLDPC_outMode_LLRINT8;
-
-
-+	  set_compact_BG(Zc,BG);
-+	  init_LLR_DMA_for_CUDA(&decParams, (int8_t*)channel_output_fixed[j], (int8_t*)estimated_output[j], block_length);
-       for(j=0;j<n_segments;j++) {
-     	  start_meas(time_decoder);
- #ifdef CUDA_FLAG
-         if(run_cuda){
-          printf("***********run ldpc by cuda\n");
-           n_iter = nrLDPC_decoder_LYC(&decParams, (int8_t*)channel_output_fixed[j], (int8_t*)estimated_output[j], block_length, time_decoder);
-         }  
-         else{ 
-          printf("**************run ldpc by cpu\n");  
-         // decode the sequence
-         // decoder supports BG2, Z=128 & 256
-         //esimated_output=ldpc_decoder(channel_output_fixed, block_length, No_iteration, (double)((float)nom_rate/(float)denom_rate));
-@@ -516,6 +514,7 @@ int test_ldpc(short No_iteration,
- int main(int argc, char *argv[])
- {
-+  warmup_for_GPU();
-   unsigned int errors, errors_bit, crc_misses;
-   double errors_bit_uncoded;
-   short block_length=8448; // decoder supports length: 1201 -> 1280, 2401 -> 2560
-diff --git a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
-index 161b362..931d500 100644
--- a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
-+++ b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu
-@@ -32,7 +32,7 @@
- #include "bgs/BG2_I6"
- #include "bgs/BG2_I7"
-#define MAX_ITERATION 5
-+#define MAX_ITERATION 2
- #define MC	1
- #define cudaCheck(ans) { cudaAssert((ans), __FILE__, __LINE__); }
-@@ -49,13 +49,21 @@ typedef struct{
-   char y;
-   short value;
- } h_element;
-+#include "bgs/BG1_compact_in_C.h"
-+__device__ char dev_const_llr[68*384];
-+__device__ char dev_dt [46*68*384];
-+__device__ char dev_llr[68*384];
-+__device__ unsigned char dev_tmp[68*384];
- h_element h_compact1 [46*19] = {};
- h_element h_compact2 [68*30] = {};
-__device__ __constant__ h_element dev_h_compact1[46*19];  // used in kernel 1
-__device__ __constant__ h_element dev_h_compact2[68*30];  // used in kernel 2
-+__device__  h_element dev_h_compact1[46*19];  // used in kernel 1
-+__device__  h_element dev_h_compact2[68*30];  // used in kernel 2
-+
-+// __device__ __constant__ h_element dev_h_compact1[46*19];  // used in kernel 1
-+// __device__ __constant__ h_element dev_h_compact2[68*30];  // used in kernel 2
- // row and col element count
- __device__ __constant__ char h_ele_row_bg1_count[46] = {
-@@ -92,9 +100,93 @@ __global__ void warmup()
- 	// warm up gpu for time measurement
- }
-+extern "C"
-+void warmup_for_GPU(){
-+	
-+	warmup<<<20,1024 >>>();
-+
-+}
-+
-+extern "C"
-+void set_compact_BG(int Zc,short BG){
-+	
-+	int row,col;
-+	if(BG == 1){
-+		row = 46;
-+		col = 68;
-+	}
-+	else{
-+		row = 42;
-+		col = 52;
-+	}
-+	int compact_row = 30; 
-+	int compact_col = 19;
-+	if(BG==2){compact_row = 10, compact_col = 23;}
-+	int memorySize_h_compact1 = row * compact_col * sizeof(h_element);
-+	int memorySize_h_compact2 = compact_row * col * sizeof(h_element);
-+	int lift_index = 0;
-+	short lift_set[][9] = {
-+		{2,4,8,16,32,64,128,256},
-+		{3,6,12,24,48,96,192,384},
-+		{5,10,20,40,80,160,320},
-+		{7,14,28,56,112,224},
-+		{9,18,36,72,144,288},
-+		{11,22,44,88,176,352},
-+		{13,26,52,104,208},
-+		{15,30,60,120,240},
-+		{0}
-+	};
-+	
-+	for(int i = 0; lift_set[i][0] != 0; i++){
-+		for(int j = 0; lift_set[i][j] != 0; j++){
-+			if(Zc == lift_set[i][j]){
-+				lift_index = i;
-+				break;
-+			}
-+		}
-+	}
-+	printf("\nZc = %d BG = %d\n",Zc,BG);
-+	switch(lift_index){
-+			case 0:
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I0, memorySize_h_compact1) );
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I0, memorySize_h_compact2) );
-+				break;
-+			case 1:
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I1, memorySize_h_compact1) );
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I1, memorySize_h_compact2) );
-+				break;
-+			case 2:
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I2, memorySize_h_compact1) );
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I2, memorySize_h_compact2) );
-+				break;
-+			case 3:
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I3, memorySize_h_compact1) );
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I3, memorySize_h_compact2) );
-+				break;
-+			case 4:
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I4, memorySize_h_compact1) );
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I4, memorySize_h_compact2) );
-+				break;
-+			case 5:
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I5, memorySize_h_compact1) );
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I5, memorySize_h_compact2) );
-+				break;
-+			case 6:
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I6, memorySize_h_compact1) );
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I6, memorySize_h_compact2) );
-+				break;
-+			case 7:
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, host_h_compact1_I7, memorySize_h_compact1) );
-+				cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, host_h_compact2_I7, memorySize_h_compact2) );
-+				break;
-+		}
-+	
-+	// return 0;
-+}
-+
- // Kernel 1
-__global__ void ldpc_cnp_kernel_1st_iter(char * dev_llr, char * dev_dt, int BG, int row, int col, int Zc)
-+__global__ void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, int col, int Zc)
- {
- //	if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp %d\n", threadIdx.x);
- 	int iMCW = blockIdx.y;		// codeword id
-@@ -153,7 +245,7 @@ __global__ void ldpc_cnp_kernel_1st_iter(char * dev_llr, char * dev_dt, int BG,
- 	for(int i = 0; i < s; i++){
- 		// v0: Best performance so far. 0.75f is the value of alpha.
- 		sq = 1 - 2 * ((Q_sign >> i) & 0x01);
-		R_temp = 0.8 * sign * sq * (i != idx_min ? rmin1 : rmin2);
-+		R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2);
- 		// write results to global memory
- 		h_element_t = dev_h_compact1[i*row+iBlkRow];
- 		int addr_temp = offsetR + h_element_t.y * row * Zc;
-@@ -163,7 +255,7 @@ __global__ void ldpc_cnp_kernel_1st_iter(char * dev_llr, char * dev_dt, int BG,
- }
- // Kernel_1
-__global__ void ldpc_cnp_kernel(char * dev_llr, char * dev_dt, int BG, int row, int col, int Zc)
-+__global__ void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int row, int col, int Zc)
- {
- //	if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp\n");
- 	int iMCW = blockIdx.y;
-@@ -223,7 +315,7 @@ __global__ void ldpc_cnp_kernel(char * dev_llr, char * dev_dt, int BG, int row,
- //	The 2nd recursion
- 	for(int i = 0; i < s; i ++){
- 		sq = 1 - 2 * ((Q_sign >> i) & 0x01);
-		R_temp = 0.8 * sign * sq * (i != idx_min ? rmin1 : rmin2);
-+		R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2);
- 		// write results to global memory
-@@ -236,7 +328,7 @@ __global__ void ldpc_cnp_kernel(char * dev_llr, char * dev_dt, int BG, int row,
- // Kernel 2: VNP processing
- __global__ void
-ldpc_vnp_kernel_normal(char * dev_llr, char * dev_dt, char * dev_const_llr, int BG, int row, int col, int Zc)
-+ldpc_vnp_kernel_normal(/*char * dev_llr, char * dev_dt, char * dev_const_llr,*/ int BG, int row, int col, int Zc)
- {	
- 	int iMCW = blockIdx.y;
- 	int iBlkCol = blockIdx.x;
-@@ -276,7 +368,7 @@ ldpc_vnp_kernel_normal(char * dev_llr, char * dev_dt, char * dev_const_llr, int
- }
-__global__ void pack_decoded_bit(char *dev, unsigned char *host, int col, int Zc)
-+__global__ void pack_decoded_bit(/*char *dev, unsigned char *host,*/ int col, int Zc)
- {
- 	__shared__ unsigned char tmp[128];
- 	int iMCW = blockIdx.y;
-@@ -284,15 +376,15 @@ __global__ void pack_decoded_bit(char *dev, unsigned char *host, int col, int Zc
- 	int btid = threadIdx.x;
- 	tmp[btid] = 0;
-	if(dev[tid] < 0){
-+	if(dev_llr[tid] < 0){
- 		tmp[btid] = 1 << (7-(btid&7));
- 	}
- 	__syncthreads();
- 	if(threadIdx.x < 16){
-		host[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] = 0;
-+		dev_tmp[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] = 0;
- 		for(int i = 0; i < 8; i++){
-			host[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] += tmp[threadIdx.x*8+i];
-+			dev_tmp[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] += tmp[threadIdx.x*8+i];
- 		}
- 	}
- }
-@@ -369,18 +461,38 @@ void read_BG(int BG, int *h, int row, int col)
- 	*/
- }
-+extern "C"
-+void init_LLR_DMA_for_CUDA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length){
-+	
-+	uint16_t Zc          = p_decParams->Z;
-+    uint8_t  BG         = p_decParams->BG;
-+	uint8_t row,col;
-+	if(BG == 1){
-+		row = 46;
-+		col = 68;
-+	}
-+	else{
-+		row = 42;
-+		col = 52;
-+	}
-+	unsigned char *hard_decision = (unsigned char*)p_out;
-+	int memorySize_llr_cuda = col * Zc * sizeof(char) * MC;
-+	cudaCheck( cudaMemcpyToSymbol(dev_const_llr, p_llr, memorySize_llr_cuda) );
-+	cudaCheck( cudaMemcpyToSymbol(dev_llr, p_llr, memorySize_llr_cuda) );
-+	cudaDeviceSynchronize();
-+	
-+}
- extern "C"
- int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length, time_stats_t *time_decoder)
- {
-	// alloc mem
-	//unsigned char *decision = (unsigned char*)p_out;
-+
-     uint16_t Zc          = p_decParams->Z;
-     uint8_t  BG         = p_decParams->BG;
-     uint8_t  numMaxIter = p_decParams->numMaxIter;
-     e_nrLDPC_outMode outMode = p_decParams->outMode;
-	
-+	cudaError_t cudaStatus;
- 	uint8_t row,col;
- 	if(BG == 1){
- 		row = 46;
-@@ -390,96 +502,14 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8
- 		row = 42;
- 		col = 52;
- 	}
-	int compact_row = 30, compact_col = 19, lift_index=0;;
-	if(BG==2){compact_row = 10, compact_col = 23;}
-	
-	short lift_set[][9] = {
-		{2,4,8,16,32,64,128,256},
-		{3,6,12,24,48,96,192,384},
-		{5,10,20,40,80,160,320},
-		{7,14,28,56,112,224},
-		{9,18,36,72,144,288},
-		{11,22,44,88,176,352},
-		{13,26,52,104,208},
-		{15,30,60,120,240},
-		{0}
-	};
-	
-	for(int i = 0; lift_set[i][0] != 0; i++){
-		for(int j = 0; lift_set[i][j] != 0; j++){
-			if(Zc == lift_set[i][j]){
-				lift_index = i;
-				break;
-			}
-		}
-	}
-	
-	int *h = NULL;
-	switch(lift_index){
-		case 0:
-			h = (BG == 1)? h_base_0:h_base_8;
-			break;
-		case 1:
-			h = (BG == 1)? h_base_1:h_base_9;
-			break;
-		case 2:
-			h = (BG == 1)? h_base_2:h_base_10;
-			break;
-		case 3:
-			h = (BG == 1)? h_base_3:h_base_11;
-			break;
-		case 4:
-			h = (BG == 1)? h_base_4:h_base_12;
-			break;
-		case 5:
-			h = (BG == 1)? h_base_5:h_base_13;
-			break;
-		case 6:
-			h = (BG == 1)? h_base_6:h_base_14;
-			break;
-		case 7:
-			h = (BG == 1)? h_base_7:h_base_15;
-			break;
-	}
-	/* pack BG in compact graph */
-	read_BG(BG, h, row, col);
-	
-	
-	int memorySize_h_compact1 = row * compact_col * sizeof(h_element);
-	int memorySize_h_compact2 = compact_row * col * sizeof(h_element);
-//	cpu
-	int memorySize_hard_decision = col * Zc * sizeof(unsigned char) * MC;
-	
-	
- //	alloc memory
- 	unsigned char *hard_decision = (unsigned char*)p_out;
-	
- //	gpu
- 	int memorySize_llr_cuda = col * Zc * sizeof(char) * MC;
-	int memorySize_dt_cuda = row * Zc * col * sizeof(char) * MC;
-
-
-//	alloc memory
-	char *dev_llr;
-	char *dev_dt;
-	char *dev_const_llr;
-	unsigned char *dev_tmp;
-+	cudaCheck( cudaMemcpyToSymbol(dev_const_llr, p_llr, memorySize_llr_cuda) );
-+	cudaCheck( cudaMemcpyToSymbol(dev_llr, p_llr, memorySize_llr_cuda) );
-	cudaCheck( cudaMalloc((void **)&dev_tmp, memorySize_hard_decision) );
-	cudaCheck( cudaMalloc((void **)&dev_llr, memorySize_llr_cuda) );
-	cudaCheck( cudaMalloc((void **)&dev_const_llr, memorySize_llr_cuda) );
-	cudaCheck( cudaMalloc((void **)&dev_dt, memorySize_dt_cuda) );
-	
-//	memcpy host to device	
-
-	cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, h_compact1, memorySize_h_compact1) );
-	cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, h_compact2, memorySize_h_compact2) );
-	cudaCheck( cudaMemcpy((void*)dev_const_llr, p_llr, memorySize_llr_cuda, cudaMemcpyHostToDevice) );
-start_meas(time_decoder);
-	cudaCheck( cudaMemcpy((void*)dev_llr, p_llr, memorySize_llr_cuda, cudaMemcpyHostToDevice) );
-	
-
- // Define CUDA kernel dimension
- 	int blockSizeX = Zc;
- 	dim3 dimGridKernel1(row, MC, 1); 	// dim of the thread blocks
-@@ -488,61 +518,35 @@ start_meas(time_decoder);
-     dim3 dimGridKernel2(col, MC, 1);
-     dim3 dimBlockKernel2(blockSizeX, 1, 1);	
- 	cudaDeviceSynchronize();
-	
-	cudaEvent_t start, end;
-	float time;
-
-	warmup<<<dimGridKernel1, dimBlockKernel1>>>();
-	warmup<<<dimGridKernel2, dimBlockKernel2>>>();
-
-	cudaEventCreate(&start);
-	cudaEventCreate(&end);
-	cudaEventRecord(start, 0);
-	
-//	cudaProfilerStart();
-	
- // lauch kernel 
-+
- 	for(int ii = 0; ii < MAX_ITERATION; ii++){
- 		// first kernel	
- 		if(ii == 0){
- 			ldpc_cnp_kernel_1st_iter 
- 			<<<dimGridKernel1, dimBlockKernel1>>>
-			(dev_llr, dev_dt, BG, row, col, Zc);
-+			(/*dev_llr,*/ BG, row, col, Zc);
- 		}else{
- 			ldpc_cnp_kernel
- 			<<<dimGridKernel1, dimBlockKernel1>>>
-			(dev_llr, dev_dt, BG, row, col, Zc);
-+			(/*dev_llr,*/ BG, row, col, Zc);
- 		}
-		
- 		// second kernel
-		
-			ldpc_vnp_kernel_normal
-			<<<dimGridKernel2, dimBlockKernel2>>>
-			(dev_llr, dev_dt, dev_const_llr, BG, row, col, Zc);
-		
-+		ldpc_vnp_kernel_normal
-+		<<<dimGridKernel2, dimBlockKernel2>>>
-+		// (dev_llr, dev_const_llr,BG, row, col, Zc);
-+		(BG, row, col, Zc);
- 	}
-+	
- 	int pack = (block_length/128)+1;
- 	dim3 pack_block(pack, MC, 1);
-	pack_decoded_bit<<<pack_block,128>>>(dev_llr, dev_tmp, col, Zc);
-
-
-	cudaEventRecord(end, 0);
-	cudaEventSynchronize(end);
-	cudaEventElapsedTime(&time, start, end);
-+	pack_decoded_bit<<<pack_block,128>>>(/*dev_llr,*/ /*dev_tmp,*/ col, Zc);
-
-	//cudaCheck( cudaMemcpy((*)hard_decision, (const void*)dev_tmp, memorySize_hard_decision, cudaMemcpyDeviceToHost) );
-	cudaCheck( cudaMemcpy((void*)hard_decision, (const void*)dev_tmp, (block_length/8)*sizeof(unsigned char), cudaMemcpyDeviceToHost) );
-+	cudaCheck( cudaMemcpyFromSymbol((void*)hard_decision, (const void*)dev_tmp, (block_length/8)*sizeof(unsigned char)) );
- 	cudaDeviceSynchronize();
-stop_meas(time_decoder);
-	
-	cudaCheck( cudaFree(dev_llr) );
-	cudaCheck( cudaFree(dev_dt) );
-	cudaCheck( cudaFree(dev_const_llr) );
-	cudaCheck( cudaFree(dev_tmp) );
-	//free(hard_decision);
-+
- 	return MAX_ITERATION;
- }
-diff --git a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.h b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.h
-index 0e78f98..c8868f0 100644
--- a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.h
-+++ b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.h
-@@ -23,4 +23,10 @@
- int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length, time_stats_t *time_decoder);
-+void init_LLR_DMA_for_CUDA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length);
-+
-+void warmup_for_GPU(void);
-+
-+void set_compact_BG(int Zc, short BG);
-+
- #endif