diff --git a/openair1/PHY/CODING/TESTBENCH/ldpctest.c b/openair1/PHY/CODING/TESTBENCH/ldpctest.c index 72d85b1b142330618fefe963e69158a73c0c9ab2..e67ed222eee9b97c8b0538804ba376a175fdfe79 100644 --- a/openair1/PHY/CODING/TESTBENCH/ldpctest.c +++ b/openair1/PHY/CODING/TESTBENCH/ldpctest.c @@ -402,9 +402,17 @@ int test_ldpc(short No_iteration, start_meas(time_decoder); #ifdef CUDA_FLAG if(run_cuda){ +<<<<<<< HEAD n_iter = nrLDPC_decoder_LYC(&decParams, (int8_t*)channel_output_fixed[j], (int8_t*)estimated_output[j], block_length, time_decoder); } else{ +======= + printf("***********run ldpc by cuda\n"); + n_iter = nrLDPC_decoder_LYC(&decParams, (int8_t*)channel_output_fixed[j], (int8_t*)estimated_output[j], block_length, time_decoder); + } + else{ + printf("**************run ldpc by cpu\n"); +>>>>>>> origin/develop // decode the sequence // decoder supports BG2, Z=128 & 256 //esimated_output=ldpc_decoder(channel_output_fixed, block_length, No_iteration, (double)((float)nom_rate/(float)denom_rate)); diff --git a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu index 931d5003385af8b4144fdbef244ed2176272a90e..58c6afbe6e349b91fc414c5e0c5ecbc73e60830e 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu +++ b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.cu @@ -32,7 +32,11 @@ #include "bgs/BG2_I6" #include "bgs/BG2_I7" +<<<<<<< HEAD #define MAX_ITERATION 2 +======= +#define MAX_ITERATION 5 +>>>>>>> origin/develop #define MC 1 #define cudaCheck(ans) { cudaAssert((ans), __FILE__, __LINE__); } @@ -49,21 +53,30 @@ typedef struct{ char y; short value; } h_element; +<<<<<<< HEAD #include "bgs/BG1_compact_in_C.h" __device__ char dev_const_llr[68*384]; __device__ char dev_dt [46*68*384]; __device__ char dev_llr[68*384]; __device__ unsigned char dev_tmp[68*384]; +======= + +>>>>>>> origin/develop h_element h_compact1 [46*19] = {}; h_element h_compact2 [68*30] = {}; +<<<<<<< HEAD __device__ h_element dev_h_compact1[46*19]; // used in kernel 1 __device__ h_element dev_h_compact2[68*30]; // used in kernel 2 // __device__ __constant__ h_element dev_h_compact1[46*19]; // used in kernel 1 // __device__ __constant__ h_element dev_h_compact2[68*30]; // used in kernel 2 +======= +__device__ __constant__ h_element dev_h_compact1[46*19]; // used in kernel 1 +__device__ __constant__ h_element dev_h_compact2[68*30]; // used in kernel 2 +>>>>>>> origin/develop // row and col element count __device__ __constant__ char h_ele_row_bg1_count[46] = { @@ -100,6 +113,7 @@ __global__ void warmup() // warm up gpu for time measurement } +<<<<<<< HEAD extern "C" void warmup_for_GPU(){ @@ -187,6 +201,11 @@ void set_compact_BG(int Zc,short BG){ // Kernel 1 __global__ void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, int col, int Zc) +======= + +// Kernel 1 +__global__ void ldpc_cnp_kernel_1st_iter(char * dev_llr, char * dev_dt, int BG, int row, int col, int Zc) +>>>>>>> origin/develop { // if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp %d\n", threadIdx.x); int iMCW = blockIdx.y; // codeword id @@ -245,7 +264,11 @@ __global__ void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, in for(int i = 0; i < s; i++){ // v0: Best performance so far. 0.75f is the value of alpha. sq = 1 - 2 * ((Q_sign >> i) & 0x01); +<<<<<<< HEAD R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2); +======= + R_temp = 0.8 * sign * sq * (i != idx_min ? rmin1 : rmin2); +>>>>>>> origin/develop // write results to global memory h_element_t = dev_h_compact1[i*row+iBlkRow]; int addr_temp = offsetR + h_element_t.y * row * Zc; @@ -255,7 +278,11 @@ __global__ void ldpc_cnp_kernel_1st_iter(/*char * dev_llr,*/ int BG, int row, in } // Kernel_1 +<<<<<<< HEAD __global__ void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int row, int col, int Zc) +======= +__global__ void ldpc_cnp_kernel(char * dev_llr, char * dev_dt, int BG, int row, int col, int Zc) +>>>>>>> origin/develop { // if(blockIdx.x == 0 && threadIdx.x == 1) printf("cnp\n"); int iMCW = blockIdx.y; @@ -315,7 +342,11 @@ __global__ void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int r // The 2nd recursion for(int i = 0; i < s; i ++){ sq = 1 - 2 * ((Q_sign >> i) & 0x01); +<<<<<<< HEAD R_temp = 0.75f * sign * sq * (i != idx_min ? rmin1 : rmin2); +======= + R_temp = 0.8 * sign * sq * (i != idx_min ? rmin1 : rmin2); +>>>>>>> origin/develop // write results to global memory @@ -328,7 +359,11 @@ __global__ void ldpc_cnp_kernel(/*char * dev_llr, char * dev_dt,*/ int BG, int r // Kernel 2: VNP processing __global__ void +<<<<<<< HEAD ldpc_vnp_kernel_normal(/*char * dev_llr, char * dev_dt, char * dev_const_llr,*/ int BG, int row, int col, int Zc) +======= +ldpc_vnp_kernel_normal(char * dev_llr, char * dev_dt, char * dev_const_llr, int BG, int row, int col, int Zc) +>>>>>>> origin/develop { int iMCW = blockIdx.y; int iBlkCol = blockIdx.x; @@ -368,7 +403,11 @@ ldpc_vnp_kernel_normal(/*char * dev_llr, char * dev_dt, char * dev_const_llr,*/ } +<<<<<<< HEAD __global__ void pack_decoded_bit(/*char *dev, unsigned char *host,*/ int col, int Zc) +======= +__global__ void pack_decoded_bit(char *dev, unsigned char *host, int col, int Zc) +>>>>>>> origin/develop { __shared__ unsigned char tmp[128]; int iMCW = blockIdx.y; @@ -376,15 +415,25 @@ __global__ void pack_decoded_bit(/*char *dev, unsigned char *host,*/ int col, in int btid = threadIdx.x; tmp[btid] = 0; +<<<<<<< HEAD if(dev_llr[tid] < 0){ +======= + if(dev[tid] < 0){ +>>>>>>> origin/develop tmp[btid] = 1 << (7-(btid&7)); } __syncthreads(); if(threadIdx.x < 16){ +<<<<<<< HEAD dev_tmp[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] = 0; for(int i = 0; i < 8; i++){ dev_tmp[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] += tmp[threadIdx.x*8+i]; +======= + host[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] = 0; + for(int i = 0; i < 8; i++){ + host[iMCW * col*Zc + blockIdx.x*16+threadIdx.x] += tmp[threadIdx.x*8+i]; +>>>>>>> origin/develop } } } @@ -461,6 +510,7 @@ void read_BG(int BG, int *h, int row, int col) */ } +<<<<<<< HEAD extern "C" void init_LLR_DMA_for_CUDA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length){ @@ -482,17 +532,28 @@ void init_LLR_DMA_for_CUDA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8 cudaDeviceSynchronize(); } +======= +>>>>>>> origin/develop extern "C" int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length, time_stats_t *time_decoder) { +<<<<<<< HEAD +======= + // alloc mem + //unsigned char *decision = (unsigned char*)p_out; +>>>>>>> origin/develop uint16_t Zc = p_decParams->Z; uint8_t BG = p_decParams->BG; uint8_t numMaxIter = p_decParams->numMaxIter; e_nrLDPC_outMode outMode = p_decParams->outMode; +<<<<<<< HEAD cudaError_t cudaStatus; +======= + +>>>>>>> origin/develop uint8_t row,col; if(BG == 1){ row = 46; @@ -502,6 +563,7 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8 row = 42; col = 52; } +<<<<<<< HEAD // alloc memory unsigned char *hard_decision = (unsigned char*)p_out; @@ -510,6 +572,98 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8 cudaCheck( cudaMemcpyToSymbol(dev_const_llr, p_llr, memorySize_llr_cuda) ); cudaCheck( cudaMemcpyToSymbol(dev_llr, p_llr, memorySize_llr_cuda) ); +======= + int compact_row = 30, compact_col = 19, lift_index=0;; + if(BG==2){compact_row = 10, compact_col = 23;} + + short lift_set[][9] = { + {2,4,8,16,32,64,128,256}, + {3,6,12,24,48,96,192,384}, + {5,10,20,40,80,160,320}, + {7,14,28,56,112,224}, + {9,18,36,72,144,288}, + {11,22,44,88,176,352}, + {13,26,52,104,208}, + {15,30,60,120,240}, + {0} + }; + + for(int i = 0; lift_set[i][0] != 0; i++){ + for(int j = 0; lift_set[i][j] != 0; j++){ + if(Zc == lift_set[i][j]){ + lift_index = i; + break; + } + } + } + + int *h = NULL; + switch(lift_index){ + case 0: + h = (BG == 1)? h_base_0:h_base_8; + break; + case 1: + h = (BG == 1)? h_base_1:h_base_9; + break; + case 2: + h = (BG == 1)? h_base_2:h_base_10; + break; + case 3: + h = (BG == 1)? h_base_3:h_base_11; + break; + case 4: + h = (BG == 1)? h_base_4:h_base_12; + break; + case 5: + h = (BG == 1)? h_base_5:h_base_13; + break; + case 6: + h = (BG == 1)? h_base_6:h_base_14; + break; + case 7: + h = (BG == 1)? h_base_7:h_base_15; + break; + } + /* pack BG in compact graph */ + read_BG(BG, h, row, col); + + + int memorySize_h_compact1 = row * compact_col * sizeof(h_element); + int memorySize_h_compact2 = compact_row * col * sizeof(h_element); + +// cpu + int memorySize_hard_decision = col * Zc * sizeof(unsigned char) * MC; + + +// alloc memory + unsigned char *hard_decision = (unsigned char*)p_out; + +// gpu + int memorySize_llr_cuda = col * Zc * sizeof(char) * MC; + int memorySize_dt_cuda = row * Zc * col * sizeof(char) * MC; + + +// alloc memory + char *dev_llr; + char *dev_dt; + char *dev_const_llr; + unsigned char *dev_tmp; + + cudaCheck( cudaMalloc((void **)&dev_tmp, memorySize_hard_decision) ); + cudaCheck( cudaMalloc((void **)&dev_llr, memorySize_llr_cuda) ); + cudaCheck( cudaMalloc((void **)&dev_const_llr, memorySize_llr_cuda) ); + cudaCheck( cudaMalloc((void **)&dev_dt, memorySize_dt_cuda) ); + +// memcpy host to device + + cudaCheck( cudaMemcpyToSymbol(dev_h_compact1, h_compact1, memorySize_h_compact1) ); + cudaCheck( cudaMemcpyToSymbol(dev_h_compact2, h_compact2, memorySize_h_compact2) ); + cudaCheck( cudaMemcpy((void*)dev_const_llr, p_llr, memorySize_llr_cuda, cudaMemcpyHostToDevice) ); +start_meas(time_decoder); + cudaCheck( cudaMemcpy((void*)dev_llr, p_llr, memorySize_llr_cuda, cudaMemcpyHostToDevice) ); + + +>>>>>>> origin/develop // Define CUDA kernel dimension int blockSizeX = Zc; dim3 dimGridKernel1(row, MC, 1); // dim of the thread blocks @@ -518,14 +672,33 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8 dim3 dimGridKernel2(col, MC, 1); dim3 dimBlockKernel2(blockSizeX, 1, 1); cudaDeviceSynchronize(); +<<<<<<< HEAD // lauch kernel +======= + + cudaEvent_t start, end; + float time; + + warmup<<<dimGridKernel1, dimBlockKernel1>>>(); + warmup<<<dimGridKernel2, dimBlockKernel2>>>(); + + + cudaEventCreate(&start); + cudaEventCreate(&end); + cudaEventRecord(start, 0); + +// cudaProfilerStart(); + +// lauch kernel +>>>>>>> origin/develop for(int ii = 0; ii < MAX_ITERATION; ii++){ // first kernel if(ii == 0){ ldpc_cnp_kernel_1st_iter <<<dimGridKernel1, dimBlockKernel1>>> +<<<<<<< HEAD (/*dev_llr,*/ BG, row, col, Zc); }else{ ldpc_cnp_kernel @@ -547,6 +720,43 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8 cudaDeviceSynchronize(); +======= + (dev_llr, dev_dt, BG, row, col, Zc); + }else{ + ldpc_cnp_kernel + <<<dimGridKernel1, dimBlockKernel1>>> + (dev_llr, dev_dt, BG, row, col, Zc); + } + + // second kernel + + ldpc_vnp_kernel_normal + <<<dimGridKernel2, dimBlockKernel2>>> + (dev_llr, dev_dt, dev_const_llr, BG, row, col, Zc); + + } + int pack = (block_length/128)+1; + dim3 pack_block(pack, MC, 1); + pack_decoded_bit<<<pack_block,128>>>(dev_llr, dev_tmp, col, Zc); + + + cudaEventRecord(end, 0); + cudaEventSynchronize(end); + cudaEventElapsedTime(&time, start, end); + + + //cudaCheck( cudaMemcpy((*)hard_decision, (const void*)dev_tmp, memorySize_hard_decision, cudaMemcpyDeviceToHost) ); + cudaCheck( cudaMemcpy((void*)hard_decision, (const void*)dev_tmp, (block_length/8)*sizeof(unsigned char), cudaMemcpyDeviceToHost) ); + cudaDeviceSynchronize(); +stop_meas(time_decoder); + + cudaCheck( cudaFree(dev_llr) ); + cudaCheck( cudaFree(dev_dt) ); + cudaCheck( cudaFree(dev_const_llr) ); + cudaCheck( cudaFree(dev_tmp) ); + + //free(hard_decision); +>>>>>>> origin/develop return MAX_ITERATION; } diff --git a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.h b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.h index c8868f0a8565df37c0ef32c94f2128167205652a..e82c81d28edf0c60de242218e2b3900cdf3d06b2 100644 --- a/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.h +++ b/openair1/PHY/CODING/nrLDPC_decoder_LYC/nrLDPC_decoder_LYC.h @@ -23,10 +23,13 @@ int32_t nrLDPC_decoder_LYC(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length, time_stats_t *time_decoder); +<<<<<<< HEAD void init_LLR_DMA_for_CUDA(t_nrLDPC_dec_params* p_decParams, int8_t* p_llr, int8_t* p_out, int block_length); void warmup_for_GPU(void); void set_compact_BG(int Zc, short BG); +======= +>>>>>>> origin/develop #endif