Commit e516e536 authored by tyhsu's avatar tyhsu

turbo decoder gpu

parent 7f8b393d
/*! \file PHY\CUDA/LTE_TRANSPORT/turbo_parm.h
* \brief turbo decoder using gpu
* \author TerngYin Hsu, JianYa Chu
* \date 2018
* \version 0.1
* \company ISIP LAB/NCTU CS
* \email: tyhsu@cs.nctu.edu.tw
* \note
* \warning
*/
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
typedef float llr_t;
typedef struct
{
llr_t *sys_d;
llr_t *sys1_d;
llr_t *sys2_d;
llr_t *ypar1_d;
llr_t *ypar2_d;
llr_t *ext_d;
llr_t *ext2_d;
llr_t *alpha_d;
int *decode_tmp;
llr_t *decode_ext2;
llr_t *alpha_pre_1;
llr_t *alpha_pre_2;
llr_t *beta_pre_1;
llr_t *beta_pre_2;
unsigned char *decode_h;
unsigned char *decode_d;
}turbo_parm_s;
typedef struct
{
// for excution time
cudaEvent_t e_start;
cudaEvent_t e_stop;
float e_time;
// for fetch time
cudaEvent_t f_start, f_stop;
float f_time;
// for memcpy
cudaEvent_t m_start, m_stop;
float m_time;
// for turbo kernel excution time
cudaEvent_t t_start, t_stop;
float t_time;
// for decode excution time
cudaEvent_t d_start, d_stop;
float d_time;
// for crc check
cudaEvent_t s_check[3];
// for algorithm & decode stream
cudaStream_t stream[2];
}cuda_parm_s;
turbo_parm_s* turbo_parm;
cuda_parm_s cuda_parm;
\ No newline at end of file
/*! \file PHY\CUDA/LTE_TRANSPORT/turbo_rx_gpu.cu
* \brief turbo decoder using gpu
* \author TerngYin Hsu, JianYa Chu
* \date 2018
* \version 0.1
* \company ISIP LAB/NCTU CS
* \email: tyhsu@cs.nctu.edu.tw
* \note
* \warning
*/
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <math.h>
#include "turbo_parm.h"
#include "PHY/CODING/extern_3GPPinterleaver.h"
//#include "extern_interleaver.h"
#include "PHY/CODING/defs.h"
#include "turbo_rx_gpu.h"
//#include "crc_byte.h"
//#include "turbo_rx.h"
//#include "PHY/CODING/extern_table_gpu.h"
//#include "extern_table_gpu.h"
#include "PHY/defs.h"
//typedef int16_t llr_t;
#define CRC24_A 0
#define CRC24_B 1
#define CRC16 2
#define CRC8 3
int intable_h[188][6144];
int detable_h[188][6144];
void free_ptr()
{
cudaFree(turbo_parm->sys_d);
cudaFree(turbo_parm->sys1_d);
cudaFree(turbo_parm->sys2_d);
cudaFree(turbo_parm->ypar1_d);
cudaFree(turbo_parm->ypar2_d);
cudaFree(turbo_parm->alpha_d);
cudaFree(turbo_parm->alpha_pre_1);
cudaFree(turbo_parm->alpha_pre_2);
cudaFree(turbo_parm->beta_pre_1);
cudaFree(turbo_parm->beta_pre_2);
cudaFree(turbo_parm->ext_d);
cudaFree(turbo_parm->ext2_d);
cudaFree(turbo_parm->decode_ext2);
cudaFree(turbo_parm->decode_tmp);
cudaFreeHost(turbo_parm->decode_h);
free(turbo_parm);
char i;
for(i=0;i<2;i++)
{
cudaStreamDestroy(cuda_parm.stream[i]);
}
}
__constant__ int alpha_table_0[32];
__constant__ int alpha_table_1[32];
__constant__ int beta_table_0[32];
__constant__ int beta_table_1[32];
__constant__ float alpha_par_table_0[32];
__constant__ float alpha_par_table_1[32];
__constant__ float beta_par_table_0[32];
__constant__ float beta_par_table_1[32];
__constant__ int interleaver[6144];
__constant__ int de_interleaver[6144];
void init_alloc()
{
size_t pitch;
cudaDeviceProp deviceprop;
cudaGetDeviceProperties(&deviceprop,0);
cudaSetDeviceFlags(cudaDeviceMapHost);
if(deviceprop.canMapHostMemory!=1)
printf("cudaError:cannot map host to device memory\n");
cudaError_t result;
turbo_parm = (turbo_parm_s*)malloc(sizeof(turbo_parm_s));
// allocate CUDA memory
result = cudaMallocPitch((void**)&turbo_parm->sys_d, &pitch, 16*6144*sizeof(llr_t) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->sys_d failed, err_num=%d\n",result);
result = cudaMallocPitch((void**)&turbo_parm->sys1_d, &pitch, 16*6144*sizeof(llr_t) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->sys1_d failed, err_num=%d\n",result);
result = cudaMallocPitch((void**)&turbo_parm->sys2_d, &pitch, 16*6144*sizeof(llr_t) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->sys2_d failed, err_num=%d\n",result);
result = cudaMallocPitch((void**)&turbo_parm->ypar1_d, &pitch, 16*6144*sizeof(llr_t) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->ypar1_d failed, err_num=%d\n",result);
result = cudaMallocPitch((void**)&turbo_parm->ypar2_d, &pitch, 16*6144*sizeof(llr_t) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->ypar2_d failed, err_num=%d\n",result);
result = cudaMallocPitch((void**)&turbo_parm->alpha_d, &pitch, 16*8*(6144+648)*sizeof(llr_t) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->alpha failed, err_num=%d\n",result);
result = cudaMallocPitch((void**)&turbo_parm->alpha_pre_1, &pitch, 16*32*162*4*sizeof(llr_t) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->alpha_pre_1 failed, err_num=%d\n",result);
result = cudaMallocPitch((void**)&turbo_parm->alpha_pre_2, &pitch, 16*32*162*4*sizeof(llr_t) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->alpha_pre_2 failed, err_num=%d\n",result);
result = cudaMallocPitch((void**)&turbo_parm->beta_pre_1, &pitch, 16*32*162*4*sizeof(llr_t) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->beta_pre1 failed, err_num=%d\n",result);
result = cudaMallocPitch((void**)&turbo_parm->beta_pre_2, &pitch, 16*32*162*4*sizeof(llr_t) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->beta_pre2 failed, err_num=%d\n",result);
result = cudaMallocPitch((void**)&turbo_parm->ext_d, &pitch, 16*6144*sizeof(llr_t) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->ext_d failed, err_num=%d\n",result);
result = cudaMallocPitch((void**)&turbo_parm->ext2_d, &pitch, 16*6144*sizeof(llr_t) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->ext2_d failed, err_num=%d\n",result);
result = cudaMallocPitch((void**)&turbo_parm->decode_ext2, &pitch, 3*16*6144*sizeof(llr_t) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->ext2_d failed, err_num=%d\n",result);
result = cudaMallocPitch((void**)&turbo_parm->decode_tmp, &pitch, 3*16*6144*sizeof(int) ,1);
if(result!=cudaSuccess)
printf("cudaMalloc turbo_parm->ext2_d failed, err_num=%d\n",result);
result = cudaHostAlloc((void**)&turbo_parm->decode_h,3*16*768*sizeof(unsigned char), cudaHostAllocMapped);
if(result!=cudaSuccess)
printf("cudaHostAlloc turbo_parm->decode_h filaed, err_num=%d\n",result);
// get device pointer
result = cudaHostGetDevicePointer(&turbo_parm->decode_d, turbo_parm->decode_h, 0);
if(result!=cudaSuccess)
printf("cuda get device pinter decode_d failed, err_num=%d\n",result);
// memset for mem
cudaMemset(turbo_parm->ext2_d,0,16*6144*sizeof(llr_t));
cudaMemset(turbo_parm->alpha_d,0,16*8*(6144+648)*sizeof(llr_t));
cudaMemset(turbo_parm->decode_tmp,0,16*6144*sizeof(llr_t));
//memset(turbo_parm->decode_h,0,16*768*sizeof(char));
// init table for decoder
int a_table_0[32]={0,3,4,7,1,2,5,6,8,11,12,15,9,10,13,14,16,19,20,23,17,18,21,22,24,27,28,31,25,26,29,30};
cudaMemcpyToSymbol(alpha_table_0,a_table_0,32*sizeof(int));
int a_table_1[32]={1,2,5,6,0,3,4,7,9,10,13,14,8,11,12,15,17,18,21,22,16,19,20,23,25,26,29,30,24,27,28,31};
cudaMemcpyToSymbol(alpha_table_1,a_table_1,32*sizeof(int));
float a_p_table_0[32] = {0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0};
cudaMemcpyToSymbol(alpha_par_table_0,a_p_table_0,32*sizeof(llr_t));
float a_p_table_1[32] = {1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0 ,1.0 ,0.0 ,0.0 ,1.0};
cudaMemcpyToSymbol(alpha_par_table_1,a_p_table_1,32*sizeof(llr_t));
int b_table_0[32]={0,4,5,1,2,6,7,3,8,12,13,9,10,14,15,11,16,20,21,17,18,22,23,19,24,28,29,25,26,30,31,27};
cudaMemcpyToSymbol(beta_table_0,b_table_0,32*sizeof(int));
int b_table_1[32]={4,0,1,5,6,2,3,7,12,8,9,13,14,10,11,15,20,16,17,21,22,18,19,23,28,24,25,29,30,26,27,31};
cudaMemcpyToSymbol(beta_table_1,b_table_1,32*sizeof(int));
float b_p_table_0[32] = {0.0 ,0.0 ,1.0 ,1.0 ,1.0 ,1.0 ,0.0 ,0.0 , 0.0 ,0.0 ,1.0 ,1.0 ,1.0 ,1.0 ,0.0 ,0.0 , 0.0 ,0.0 ,1.0 ,1.0 ,1.0 ,1.0 ,0.0 ,0.0 , 0.0 ,0.0 ,1.0 ,1.0 ,1.0 ,1.0 ,0.0 ,0.0};
cudaMemcpyToSymbol(beta_par_table_0,b_p_table_0,32*sizeof(llr_t));
float b_p_table_1[32] = {1.0 ,1.0 ,0.0 ,0.0 ,0.0 ,0.0 ,1.0 ,1.0 , 1.0 ,1.0 ,0.0 ,0.0 ,0.0 ,0.0 ,1.0 ,1.0 , 1.0 ,1.0 ,0.0 ,0.0 ,0.0 ,0.0 ,1.0 ,1.0 , 1.0 ,1.0 ,0.0 ,0.0 ,0.0 ,0.0 ,1.0 ,1.0};
cudaMemcpyToSymbol(beta_par_table_1,b_p_table_1,32*sizeof(llr_t));
// build de-interleaver table and interleaver table
int i, j;
unsigned long n;
unsigned short f1, f2;
for(j=0;j<188;j++)
{
n = f1f2mat[j].nb_bits;
f1 = f1f2mat[j].f1;
f2 = f1f2mat[j].f2;
for(i=0;i<n;i++)
{
intable_h[j][i] = (((f1+f2*i)%n)*i)%n;
detable_h[j][(((f1+f2*i)%n)*i)%n] = i;
}
}
// for crc check and stream create
for(i=0;i<2;i++)
{
cudaStreamCreate(&cuda_parm.stream[i]);
}
for(i=0;i<3;i++)
{
cudaEventCreate(&cuda_parm.s_check[i]);
}
}
__device__ void compute_alpha(float* sys, float* sys1, float* sys2,
float* par,
float* alpha, float* alpha_tmp,
float* alpha_pre_1, float* alpha_pre_2,
int num_per_block, int iteration_cnt, int decoder_id, int n, int codeword_num)
{
int alpha_start = blockIdx.y*(n+gridDim.x*4)*8 + blockIdx.x*(num_per_block+1)*8*4;
int index = blockIdx.y*n + blockIdx.x*num_per_block*4 + num_per_block*threadIdx.y;
llr_t r0, r1;
char i;
alpha_tmp[threadIdx.x + 8*threadIdx.y] = 0;
if(!(iteration_cnt==0 || (iteration_cnt==1 && decoder_id==2)))
{
if(!(blockIdx.x==0 && threadIdx.y==0))
{
if(decoder_id==1)
alpha_tmp[threadIdx.x + 8*threadIdx.y] = alpha_pre_1[blockIdx.y*gridDim.x*32 + blockIdx.x*32 + threadIdx.x + 8*threadIdx.y -8];
else
alpha_tmp[threadIdx.x + 8*threadIdx.y] = alpha_pre_2[blockIdx.y*gridDim.x*32 + blockIdx.x*32 + threadIdx.x + 8*threadIdx.y -8];
}
}
alpha[alpha_start + threadIdx.x + 8*threadIdx.y] = alpha_tmp[threadIdx.x + 8*threadIdx.y];
__syncthreads();
for(i=0;i<num_per_block;i++)
{
if(decoder_id==1)
{
r0 = alpha_par_table_0[threadIdx.x + 8*threadIdx.y]*par[index + i];
r1 = sys1[index + i] + alpha_par_table_1[threadIdx.x + 8*threadIdx.y]*par[index + i];
}
else
{
r0 = alpha_par_table_0[threadIdx.x + 8*threadIdx.y]*par[index + i];
r1 = sys2[index + i] + alpha_par_table_1[threadIdx.x + 8*threadIdx.y]*par[index + i];
}
alpha[alpha_start + (i+1)*32 + threadIdx.x + 8*threadIdx.y] = fmaxf(alpha_tmp[alpha_table_0[threadIdx.x+8*threadIdx.y]] + r0, alpha_tmp[alpha_table_1[threadIdx.x + 8*threadIdx.y]] + r1);
__syncthreads();
alpha_tmp[threadIdx.x + 8*threadIdx.y] = alpha[alpha_start + (i+1)*32 + threadIdx.x + 8*threadIdx.y];
if(i==num_per_block-1)
{
if(decoder_id==1)
alpha_pre_1[blockIdx.y*gridDim.x*32 + blockIdx.x*32 + threadIdx.x + 8*threadIdx.y] = alpha_tmp[threadIdx.x + 8*threadIdx.y];
else
alpha_pre_2[blockIdx.y*gridDim.x*32 + blockIdx.x*32 + threadIdx.x + 8*threadIdx.y] = alpha_tmp[threadIdx.x + 8*threadIdx.y];
}
}
}
__device__ void compute_beta_ext(float* sys, float* sys1, float* sys2,
float* par,
float* alpha, float* beta_now, float* beta_next,
float* beta_pre_1, float* beta_pre_2,
float* ext_tmp0, float* ext_tmp1,
float* ext, float* ext2, float* decode_ext2,
int num_per_block, int iteration_cnt, int decoder_id, int n, int codeword_num)
{
llr_t a, r0, r1, max_0, max_1;
int alpha_start = blockIdx.y*(n+gridDim.x*4)*8 + blockIdx.x*(num_per_block+1)*8*4;
int index = blockIdx.y*n + blockIdx.x*num_per_block*4 + threadIdx.y*num_per_block;
int index2;
char i,j;
beta_now[threadIdx.x + 8*threadIdx.y] = 0;
if(!(iteration_cnt==0 || (iteration_cnt==1 && decoder_id==2)))
{
if(!(blockIdx.x==gridDim.x-1 && threadIdx.y==3))
{
if(decoder_id==1)
beta_now[threadIdx.x + 8*threadIdx.y] = beta_pre_1[blockIdx.y*gridDim.x*32 + blockIdx.x*32 + threadIdx.x + 8*threadIdx.y +8];
else
beta_now[threadIdx.x + 8*threadIdx.y] = beta_pre_2[blockIdx.y*gridDim.x*32 + blockIdx.x*32 + threadIdx.x + 8*threadIdx.y +8];
}
}
__syncthreads();
for(i=num_per_block-1;i>=0;i--)
{
if(decoder_id==1)
{
r0 = beta_par_table_0[threadIdx.x + 8*threadIdx.y]*par[index + i];
r1 = sys1[index + i] + beta_par_table_1[threadIdx.x + 8*threadIdx.y]*par[index + i];
}
else
{
r0 = beta_par_table_0[threadIdx.x + 8*threadIdx.y]*par[index + i];
r1 = sys2[index + i] + beta_par_table_1[threadIdx.x + 8*threadIdx.y]*par[index + i];
}
a = alpha[alpha_start + 32*i + threadIdx.x + 8*threadIdx.y];
beta_next[threadIdx.x + 8*threadIdx.y] = fmaxf(beta_now[beta_table_0[threadIdx.x+8*threadIdx.y]] + r0, beta_now[beta_table_1[threadIdx.x + 8*threadIdx.y]] + r1);
if(i==0)
{
if(decoder_id==1)
beta_pre_1[blockIdx.y*gridDim.x*32 + blockIdx.x*32 + threadIdx.x + 8*threadIdx.y] = beta_next[threadIdx.x + 8*threadIdx.y];
else
beta_pre_2[blockIdx.y*gridDim.x*32 + blockIdx.x*32 + threadIdx.x + 8*threadIdx.y] = beta_next[threadIdx.x + 8*threadIdx.y];
}
ext_tmp0[((num_per_block-1-i)&7)*32 + threadIdx.x + 8*threadIdx.y] = a + r0 + beta_now[beta_table_0[threadIdx.x + 8*threadIdx.y]];
ext_tmp1[((num_per_block-1-i)&7)*32 + threadIdx.x + 8*threadIdx.y] = a + r1 + beta_now[beta_table_1[threadIdx.x + 8*threadIdx.y]];
__syncthreads();
beta_now[threadIdx.x + 8*threadIdx.y] = beta_next[threadIdx.x + 8*threadIdx.y];
if(((num_per_block-1-i)&7)==7)
{
max_0 = ext_tmp0[(7-threadIdx.x)*32 + threadIdx.x + 8*threadIdx.y];
max_1 = ext_tmp1[(7-threadIdx.x)*32 + threadIdx.x + 8*threadIdx.y];
for(j=1;j<8;j++)
{
index2 = (threadIdx.x + j)&7;
max_0 = fmaxf(max_0, ext_tmp0[(7-threadIdx.x)*32 + index2 + 8*threadIdx.y]);
max_1 = fmaxf(max_1, ext_tmp1[(7-threadIdx.x)*32 + index2 + 8*threadIdx.y]);
}
index2 = blockIdx.x*num_per_block*4 + threadIdx.y*num_per_block + i + threadIdx.x;
if(decoder_id==1)
{
ext[blockIdx.y*n + index2] = max_1 - max_0 - sys1[blockIdx.y*n + index2] + sys[blockIdx.y*n + index2];
sys2[blockIdx.y*n + de_interleaver[index2]] = ext[blockIdx.y*n + index2];
}
else
{
ext2[blockIdx.y*n + index2] = max_1 - max_0;
if(iteration_cnt >= 3)
{
decode_ext2[(iteration_cnt-3)*gridDim.y*n + blockIdx.y*n + interleaver[index2]] = max_1 - max_0;
}
sys1[blockIdx.y*n + interleaver[index2]] = max_1 - max_0 - ext[blockIdx.y*n + interleaver[index2]] + sys[blockIdx.y*n + interleaver[index2]];
}
}
else if(i==0 && (num_per_block&7)!=0 && threadIdx.x < (num_per_block&7))
{
max_0 = ext_tmp0[((num_per_block&7)-1-threadIdx.x)*32 + threadIdx.x + 8*threadIdx.y];
max_1 = ext_tmp1[((num_per_block&7)-1-threadIdx.x)*32 + threadIdx.x + 8*threadIdx.y];
for(j=1;j<8;j++)
{
index2 = (threadIdx.x + j)&7;
max_0 = fmaxf(max_0, ext_tmp0[((num_per_block&7)-1-threadIdx.x)*32 + index2 + 8*threadIdx.y]);
max_1 = fmaxf(max_1, ext_tmp1[((num_per_block&7)-1-threadIdx.x)*32 + index2 + 8*threadIdx.y]);
}
index2 = blockIdx.x*num_per_block*4 + threadIdx.y*num_per_block + i + threadIdx.x;
if(decoder_id==1)
{
ext[blockIdx.y*n + index2] = max_1 - max_0 - sys1[blockIdx.y*n + index2] + sys[blockIdx.y*n + index2];
sys2[blockIdx.y*n + de_interleaver[index2]] = ext[blockIdx.y*n + index2];
}
else
{
ext2[blockIdx.y*n + index2] = max_1 - max_0;
if(iteration_cnt >= 3)
{
decode_ext2[(iteration_cnt-3)*gridDim.y*n + blockIdx.y*n + interleaver[index2]] = max_1 - max_0;
}
sys1[blockIdx.y*n + interleaver[index2]] = max_1 - max_0 - ext[blockIdx.y*n + interleaver[index2]] + sys[blockIdx.y*n + interleaver[index2]];
}
}
}
}
__global__ void log(float* sys, float* sys1, float* sys2,
float* par,
float* alpha,
float* alpha_pre_1, float* alpha_pre_2, float* beta_pre_1, float* beta_pre_2,
float* ext, float* ext2, float* decode_ext2,
int num_per_block, int iteration_cnt, int decoder_id, int n, int codeword_num
)
{
__shared__ llr_t alpha_tmp[32];
__shared__ llr_t beta_tmp[32];
__shared__ llr_t ext_tmp0[32*8];
__shared__ llr_t ext_tmp1[32*8];
compute_alpha(sys, sys1, sys2,
par,
alpha, alpha_tmp,
alpha_pre_1, alpha_pre_2,
num_per_block, iteration_cnt, decoder_id, n , codeword_num
);
__syncthreads();
compute_beta_ext(sys, sys1, sys2,
par,
alpha, alpha_tmp,beta_tmp,
beta_pre_1, beta_pre_2,
ext_tmp0, ext_tmp1,
ext, ext2, decode_ext2,
num_per_block, iteration_cnt, decoder_id, n, codeword_num
);
}
// for decoding
__global__ void decode(llr_t* decode_ext2, unsigned char* decode_d, int* decode_tmp, int n, int decode_len, int iteration_cnt)
{
int i, j;
for(i=threadIdx.x; i<n; i+=256)
{
decode_tmp[iteration_cnt*gridDim.x*n + blockIdx.x*n + i] = 0;
if(decode_ext2[iteration_cnt*gridDim.x*n + blockIdx.x*n + i] > 0)
{
decode_tmp[iteration_cnt*gridDim.x*n + blockIdx.x*n + i] = 1 << (7-(i&7));
}
}
__syncthreads();
for(i=threadIdx.x; i<decode_len; i+=256)
{
decode_d[iteration_cnt*gridDim.x*decode_len + blockIdx.x*decode_len + i] = 0;
for(j=0;j<8;j++)
{
decode_d[iteration_cnt*gridDim.x*decode_len + blockIdx.x*decode_len + i] += decode_tmp[iteration_cnt*gridDim.x*n + blockIdx.x*n + i*8 + j];
}
}
}
//#define TIME_EST
unsigned char phy_threegpplte_turbo_decoder_gpu(short **y,
unsigned char **decoded_bytes,
unsigned int codeword_num,
unsigned short n,
unsigned short f1,
unsigned short f2,
unsigned char max_iterations,
unsigned char crc_type,
unsigned char *f_tmp,
unsigned char* ret)
{
unsigned int i,j,iind,k;
#ifdef TIME_EST
cudaEventCreate(&cuda_parm.e_start);
cudaEventCreate(&cuda_parm.e_stop);
cuda_parm.e_time = 0;
cudaEventRecord(cuda_parm.e_start, cuda_parm.stream[0]);
#endif
llr_t sys_h[n*codeword_num], ypar1_h[n*codeword_num], ypar2_h[n*codeword_num];
unsigned char iteration_cnt=0;
unsigned int crc,oldcrc,crc_len;
uint8_t temp;
unsigned char F;
if (crc_type > 3) {
printf("Illegal crc length!\n");
return 255;
}
for (iind=0; f1f2mat[iind].nb_bits!=n && iind <188; iind++);
if ( iind == 188 ) {
printf("Illegal frame length!\n");
return 255;
}
switch (crc_type) {
case CRC24_A:
case CRC24_B:
crc_len=3;
break;
case CRC16:
crc_len=2;
break;
case CRC8:
crc_len=1;
break;
default:
crc_len=3;
}
// fetch data for each codeword
#ifdef TIME_EST
cudaEventCreate(&cuda_parm.f_start);
cudaEventCreate(&cuda_parm.f_stop);
cuda_parm.f_time = 0;
cudaEventRecord(cuda_parm.f_start,cuda_parm.stream[0]);
#endif
short* yp;
for(i=0;i<codeword_num;i++)
{
yp = y[i];
for(j=0;j<n;j++)
{
sys_h[j+n*i] = *yp;
ypar1_h[j+n*i] = *(yp+1);
ypar2_h[j+n*i] = *(yp+2);
yp+=3;
}
}
#ifdef TIME_EST
cudaEventRecord(cuda_parm.f_stop,cuda_parm.stream[0]);
cudaEventSynchronize(cuda_parm.f_stop);
cuda_parm.f_time = 0;
cudaEventElapsedTime(&cuda_parm.f_time, cuda_parm.f_start, cuda_parm.f_stop);
printf("Fetech data time = %f ms\n",cuda_parm.f_time);
#endif
// for kernel memcpy
#ifdef TIME_EST
cudaEventCreate(&cuda_parm.m_start);
cudaEventCreate(&cuda_parm.m_stop);
cudaEventRecord(cuda_parm.m_start, cuda_parm.stream[0]);
#endif
cudaMemcpyAsync(turbo_parm->sys_d,sys_h,codeword_num*n*sizeof(llr_t),cudaMemcpyHostToDevice, cuda_parm.stream[0]);
cudaMemcpyAsync(turbo_parm->ypar1_d,ypar1_h,codeword_num*n*sizeof(llr_t),cudaMemcpyHostToDevice, cuda_parm.stream[0]);
cudaMemcpyAsync(turbo_parm->ypar2_d,ypar2_h,codeword_num*n*sizeof(llr_t),cudaMemcpyHostToDevice, cuda_parm.stream[0]);
cudaMemcpyToSymbolAsync(interleaver, intable_h[iind],n*sizeof(int), 0, cudaMemcpyHostToDevice, cuda_parm.stream[0]);
cudaMemcpyToSymbolAsync(de_interleaver, detable_h[iind],n*sizeof(int), 0, cudaMemcpyHostToDevice, cuda_parm.stream[0]);
#ifdef TIME_EST
cudaEventRecord(cuda_parm.m_stop, cuda_parm.stream[0]);
cudaEventSynchronize(cuda_parm.m_stop);
cuda_parm.m_time=0;
cudaEventElapsedTime(&cuda_parm.m_time, cuda_parm.m_start, cuda_parm.m_stop);
printf("Memcpy Time For Kernel = %f ms\n",cuda_parm.m_time);
#endif
#ifdef TIME_EST
cuda_parm.t_time = 0;
cudaEventCreate(&cuda_parm.t_start);
cudaEventCreate(&cuda_parm.t_stop);
cudaEventRecord(cuda_parm.t_start,cuda_parm.stream[0]);
#endif
// decide block and thread
int blocknum=648;
while(blocknum!=8)
{
if(n%blocknum==0 && n/blocknum>=16)
{
break;
}
blocknum-=4;
}
dim3 threadnum(8,4);
size_t s_size = 0;
int num_per_block = n / blocknum;
blocknum = blocknum/4;
dim3 bb(blocknum, codeword_num);
llr_t ext2[n*codeword_num];
memset(ext2,0,sizeof(ext2));
llr_t tmp[n*codeword_num];
// for crc check
char check=0;
int z;
// log map algorithm
log<<<bb, threadnum, s_size, cuda_parm.stream[0]>>>(turbo_parm->sys_d, turbo_parm->sys_d, turbo_parm->sys2_d,
turbo_parm->ypar1_d,
turbo_parm->alpha_d,
turbo_parm->alpha_pre_1, turbo_parm->alpha_pre_2, turbo_parm->beta_pre_1, turbo_parm->beta_pre_2,
turbo_parm->ext_d, turbo_parm->ext2_d, turbo_parm->decode_ext2,
num_per_block, iteration_cnt, 1, n, codeword_num
);
while(iteration_cnt++ < max_iterations)
{
log<<<bb, threadnum, s_size, cuda_parm.stream[0]>>>(turbo_parm->sys_d, turbo_parm->sys1_d, turbo_parm->sys2_d,
turbo_parm->ypar2_d,
turbo_parm->alpha_d,
turbo_parm->alpha_pre_1, turbo_parm->alpha_pre_2, turbo_parm->beta_pre_1, turbo_parm->beta_pre_2,
turbo_parm->ext_d, turbo_parm->ext2_d, turbo_parm->decode_ext2,
num_per_block, iteration_cnt, 2, n, codeword_num
);
if(iteration_cnt>=3)
{
cudaEventRecord(cuda_parm.s_check[iteration_cnt-3], cuda_parm.stream[0]);
}
if(iteration_cnt < max_iterations)
{
log<<<bb, threadnum, s_size, cuda_parm.stream[0]>>>(turbo_parm->sys_d, turbo_parm->sys1_d, turbo_parm->sys2_d,
turbo_parm->ypar1_d,
turbo_parm->alpha_d,
turbo_parm->alpha_pre_1, turbo_parm->alpha_pre_2, turbo_parm->beta_pre_1, turbo_parm->beta_pre_2,
turbo_parm->ext_d, turbo_parm->ext2_d, turbo_parm->decode_ext2,
num_per_block, iteration_cnt, 1, n, codeword_num
);
}
}
cudaDeviceSynchronize();
#ifdef TIME_EST
cudaEventRecord(cuda_parm.t_stop, cuda_parm.stream[0]);
cudaEventSynchronize(cuda_parm.t_stop);
cudaEventElapsedTime(&cuda_parm.t_time, cuda_parm.t_start, cuda_parm.t_stop);
printf("Time For turbo algorithm Kernel = %f ms\n",cuda_parm.t_time);
#endif
int decode_len = n >> 3;
for(i=0;i<=2;i++)
{
check = 0;
// wait for turbo ext
cudaStreamWaitEvent(cuda_parm.stream[1], cuda_parm.s_check[i], 0);
#ifdef TIME_EST
cuda_parm.d_time = 0;
cudaEventCreate(&cuda_parm.d_start);
cudaEventCreate(&cuda_parm.d_stop);
cudaEventRecord(cuda_parm.d_start, cuda_parm.stream[1]);
#endif
// decode
decode<<<codeword_num, 256, 0, cuda_parm.stream[1]>>>(turbo_parm->decode_ext2, turbo_parm->decode_d, turbo_parm->decode_tmp, n, decode_len, i);
cudaStreamSynchronize(cuda_parm.stream[1]);
// crc check
for(j=0;j<codeword_num;j++)
{
F = f_tmp[1];
if(j==0)
{
F = f_tmp[0];
}
oldcrc = *((unsigned int *)(&turbo_parm->decode_h[i*codeword_num*decode_len + j*decode_len + decode_len-crc_len]));
switch(crc_type)
{
case CRC24_A:
oldcrc&=0x00ffffff;
crc = crc24a(&turbo_parm->decode_h[i*codeword_num*decode_len + j*decode_len + ( F>>3 )], n-24-F)>>8;
temp=((uint8_t *)&crc)[2];
((uint8_t *)&crc)[2] = ((uint8_t *)&crc)[0];
((uint8_t *)&crc)[0] = temp;
break;
case CRC24_B:
oldcrc&=0x00ffffff;
crc = crc24b(&turbo_parm->decode_h[i*codeword_num*decode_len + j*decode_len],
n-24)>>8;
temp=((uint8_t *)&crc)[2];
((uint8_t *)&crc)[2] = ((uint8_t *)&crc)[0];
((uint8_t *)&crc)[0] = temp;
break;
case CRC16:
oldcrc&=0x0000ffff;
crc = crc16(&turbo_parm->decode_h[i*codeword_num*decode_len + j*decode_len],
n-16)>>16;
break;
case CRC8:
oldcrc&=0x000000ff;
crc = crc8(&turbo_parm->decode_h[i*codeword_num*decode_len + j*decode_len],
n-8)>>24;
break;
default:
printf("FATAL: 3gpplte_turbo_decoder_sse.c: Unknown CRC\n");
return(255);
break;
}
if ((crc == oldcrc) && (crc!=0)) {
ret[j] = i+3-1;
}
else
{
check = 1;
}
}
#ifdef TIME_EST
cudaEventRecord(cuda_parm.d_stop, cuda_parm.stream[1]);
cudaEventSynchronize(cuda_parm.d_stop);
cudaEventElapsedTime(&cuda_parm.d_time, cuda_parm.d_start, cuda_parm.d_stop);
printf("Time For decode & crc check = %f ms\n",cuda_parm.d_time);
#endif
if(check==0)
{
for(j=0;j<codeword_num;j++)
{
for(k=0;k<decode_len;k++)
{
decoded_bytes[j][k] = turbo_parm->decode_h[i*codeword_num*decode_len + j*decode_len + k];
}
}
#ifdef TIME_EST
cudaEventRecord(cuda_parm.e_stop, cuda_parm.stream[0]);
cudaEventSynchronize(cuda_parm.e_stop);
cudaEventElapsedTime(&cuda_parm.e_time, cuda_parm.e_start, cuda_parm.e_stop);
printf("Time For CUDA = %f ms\n",cuda_parm.e_time);
#endif
return i+3-1;
}
}
// crc check fail
for(i=0;i<codeword_num;i++)
{
for(j=0;j<decode_len;j++)
{
decoded_bytes[i][j] = turbo_parm->decode_h[2*codeword_num*decode_len + i*decode_len + j];
}
ret[i] = 5;
//return 5;
}
#ifdef TIME_EST
cudaEventRecord(cuda_parm.e_stop, cuda_parm.stream[0]);
cudaEventSynchronize(cuda_parm.e_stop);
cudaEventElapsedTime(&cuda_parm.e_time, cuda_parm.e_start, cuda_parm.e_stop);
printf("Time For CUDA = %f ms\n",cuda_parm.e_time);
#endif
for(i=0;i<2;i++)
{
cudaStreamSynchronize(cuda_parm.stream[i]);
}
return 5;
}
\ No newline at end of file
/*! \file PHY\CUDA/LTE_TRANSPORT/turbo_rx_gpu.h
* \brief turbo decoder using gpu
* \author TerngYin Hsu, JianYa Chu
* \date 2018
* \version 0.1
* \company ISIP LAB/NCTU CS
* \email: tyhsu@cs.nctu.edu.tw
* \note
* \warning
*/
#ifndef __TURBO_RX_GPU__H__
#define __TURBO_RX_GPU__H__
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#define CRC24_A 0
#define CRC24_B 1
#define CRC16 2
#define CRC8 3
typedef char Binary;
typedef float llr_t;
typedef short channel_t;
#ifdef __cplusplus
extern "C"
#endif
unsigned char phy_threegpplte_turbo_decoder_gpu(short **y,
unsigned char **decoded_bytes,
unsigned int codeword_num,
unsigned short n,
unsigned short f1,
unsigned short f2,
unsigned char max_iterations,
unsigned char crc_type,
unsigned char* f_tmp,
unsigned char* ret);
#ifdef __cplusplus
extern "C"
#endif
void free_ptr(void);
#ifdef __cplusplus
extern "C"
#endif
void init_alloc(void);
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment