Commit 21c5e485 authored by tyhsu's avatar tyhsu

v3. decrease GMEM access in beamforming precode

parent 6a271962
...@@ -116,7 +116,7 @@ extern "C" void CUDA_ifft_ofdm( int **output, ...@@ -116,7 +116,7 @@ extern "C" void CUDA_ifft_ofdm( int **output,
int threadNum = 1024; int threadNum = 1024;
int blockNum = fftsize*nb_symbols*nb_tx / threadNum; int blockNum = fftsize*nb_symbols*nb_tx / threadNum;
cu_intToComplex<<<blockNum, threadNum>>>(d_txdataF_BF, d_signal); //cu_intToComplex<<<blockNum, threadNum>>>(d_txdataF_BF, d_signal);
//CHECK_STATE("cu_intToComplex"); //CHECK_STATE("cu_intToComplex");
cufftErrchk( cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE)); cufftErrchk( cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));
...@@ -144,70 +144,78 @@ extern "C" void CUDA_ifft_ofdm( int **output, ...@@ -144,70 +144,78 @@ extern "C" void CUDA_ifft_ofdm( int **output,
} }
__device__ inline void beamComp(int *res, int *x1, int *x2){ __device__ inline void beamComp(int *res, int *x1, int *x2){
((short*)res)[0] = ((short*)x1)[0]*((short*)x2)[0] + ((short*)x1)[1]*((short*)x2)[1]; ((short*)res)[0] += ((short*)x1)[0]*((short*)x2)[0] + ((short*)x1)[1]*((short*)x2)[1];
((short*)res)[1] = ((short*)x1)[0]*((short*)x2)[1] - ((short*)x1)[1]*((short*)x2)[0]; ((short*)res)[1] += ((short*)x1)[0]*((short*)x2)[1] - ((short*)x1)[1]*((short*)x2)[0];
} }
__global__ void conjMulAll(int* txdataF, int* weight, int* res, extern __constant__ int PORTSIZE;
extern __constant__ int SUBTXSIZE;
extern __constant__ int BW_PSIZE;
__global__ void conjMulAll(int* txdataF, int* weight, int* sub,
int fftsize, int nb_symbols, int nb_tx, int nb_antenna_ports){ int fftsize, int nb_symbols, int nb_tx, int nb_antenna_ports){
__shared__ int x1[2048*5]; __shared__ int x1[2048*4];
int symbSart = blockIdx.x*5; __shared__ int res[2048];
int portId = blockIdx.y;
int id = threadIdx.x; int id = threadIdx.x;
int id2 = id+1024; int id2 = id+1024;
int aaSize = nb_antenna_ports*nb_symbols*fftsize; int symbId = blockIdx.x;
int portSize = nb_symbols*fftsize; int portStart = blockIdx.y*4;
int subtxId = blockIdx.y;
int s1=0; int s1 = 0;
for(int symbId=symbSart; symbId<(symbSart+5)&&symbId<nb_symbols; symbId++){ for(int p=portStart; p<portStart+4; p++){
x1[s1*fftsize+id] = txdataF[symbId*fftsize+id]; x1[s1*2048+id] = txdataF[p*PORTSIZE + symbId*fftsize + id];
x1[s1*fftsize+id2] = txdataF[symbId*fftsize+id2]; x1[s1*2048+id2] = txdataF[p*PORTSIZE + symbId*fftsize + id2];
s1++; s1++;
} }
for(int aa=0; aa<nb_tx; aa++){ for(int aa=0; aa<nb_tx; aa++){
for(int symbId=symbSart; symbId<(symbSart+5)&&symbId<nb_symbols; symbId++){ res[id] = 0;
int resId = aa*aaSize+portId*portSize+symbId*fftsize; res[id2] = 0;
s1 = symbId%5; s1 = 0;
beamComp(&res[resId+id], &x1[s1*fftsize+id], &weight[portId*(nb_tx*fftsize)+aa*fftsize+id]); for(int p=portStart; p<portStart+4; p++){
beamComp(&res[resId+id2], &x1[s1*fftsize+id2], &weight[portId*(nb_tx*fftsize)+aa*fftsize+id2]); beamComp(&res[id], &x1[s1*2048+id], &weight[p*BW_PSIZE+aa*fftsize+id]);
beamComp(&res[id2], &x1[s1*2048+id2], &weight[p*BW_PSIZE+aa*fftsize+id2]);
/*
if(id==0){
printf("%5d+%5di mul %5d+%5di = %5d+%5di\n",
((short*)&x1[s1*2048+id])[0], ((short*)&x1[s1*2048+id])[1],
((short*)&weight[p*BW_PSIZE+aa*fftsize+id])[0],((short*)&weight[p*BW_PSIZE+aa*fftsize+id])[1],
((short*)&res[id])[0], ((short*)&res[id])[1]);
}*/
s1++;
} }
}
int offset = subtxId*SUBTXSIZE + aa*PORTSIZE + symbId*fftsize;
sub[offset+id] = res[id];
sub[offset+id2] = res[id2];
}
} }
__device__ inline void partAdd(int *res, int *x){ __device__ inline void partAdd(Complex *res, int *x1, int *x2){
((short*)res)[0] += ((short*)x)[0]; res->x = ((short*)x1)[0] + ((short*)x2)[0];
((short*)res)[1] += ((short*)x)[1]; res->y = ((short*)x1)[1] + ((short*)x2)[1];
} }
__global__ void combine(int* res, int* txdataF_BF, int fftsize, int nb_symbols, int nb_tx, int nb_antenna_ports){ __global__ void combine(int* subtx, Complex* d_signal, int fftsize, int nb_symbols, int nb_tx, int nb_antenna_ports){
__shared__ int buf[2048*5];
int symbStart = blockIdx.x;
int txId = blockIdx.y;
int id = threadIdx.x; int id = threadIdx.x;
int id2 = id+1024; int id2 = id+1024;
int txSize = nb_antenna_ports*nb_symbols*fftsize; int aa = blockIdx.x;
int portSize = nb_symbols*fftsize; int symbStart = blockIdx.y*7;
int symbEnd = symbStart + 7;
int s1=0;
for(int p=0; p<nb_antenna_ports; p++){ for(int symb=symbStart; symb<symbEnd; symb++){
for(int symbId=symbStart; symbId<symbId+5&&symbId<nb_symbols; symbId++){ int offset = aa*PORTSIZE + symb*fftsize;
s1 = symbId%5; partAdd(&d_signal[offset+id], &subtx[offset+id], &subtx[SUBTXSIZE+offset+id]);
partAdd(&buf[s1+id], &res[txId*txSize+p*portSize+symbId*fftsize+id]); partAdd(&d_signal[offset+id2], &subtx[offset+id2], &subtx[SUBTXSIZE+offset+id2]);
partAdd(&buf[s1+id2], &res[txId*txSize+p*portSize+symbId*fftsize+id2]); //if(id==0) printf("%5.5f+%5.5fi\n", d_signal[offset+id].x, d_signal[offset+id].y);
}
}
for(int symbId=symbStart; symbId<symbId+5&&symbId<nb_symbols; symbId++){
s1 = symbId%5;
res[txId*nb_symbols*fftsize+symbId*fftsize+id] = buf[s1*fftsize+id];
res[txId*nb_symbols*fftsize+symbId*fftsize+id2] = buf[s1*fftsize+id2];
} }
} }
extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int shift, int fftsize, int nb_symbols, int nb_antenna_ports, int nb_tx){ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int shift, int fftsize, int nb_symbols, int nb_antenna_ports, int nb_tx){
cudaEvent_t start, stop; cudaEvent_t start, stop;
float time; float time;
...@@ -216,6 +224,7 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int ...@@ -216,6 +224,7 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int
//initial BF data; //initial BF data;
gpuErrchk( cudaMemset(cu_ru.d_txdataF_BF, 0, fftsize*nb_symbols*sizeof(int)*nb_tx) ); gpuErrchk( cudaMemset(cu_ru.d_txdataF_BF, 0, fftsize*nb_symbols*sizeof(int)*nb_tx) );
gpuErrchk( cudaMemset(cu_ru.d_subtx, 0, fftsize*nb_symbols*nb_tx*2*sizeof(int)) );
//move data to gpu //move data to gpu
int slotsize = fftsize*nb_symbols; int slotsize = fftsize*nb_symbols;
for(int p=0; p<nb_antenna_ports; p++){ for(int p=0; p<nb_antenna_ports; p++){
...@@ -223,8 +232,6 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int ...@@ -223,8 +232,6 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int
} }
cudaEventRecord(start); cudaEventRecord(start);
int threadNum = 1024;
int blockNum = fftsize*nb_symbols/threadNum;
int div = 1<<shift; int div = 1<<shift;
for(int aa=0; aa<nb_tx; aa++){ for(int aa=0; aa<nb_tx; aa++){
for(int p=0; p<nb_antenna_ports; p++){ for(int p=0; p<nb_antenna_ports; p++){
...@@ -241,11 +248,12 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int ...@@ -241,11 +248,12 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int
cudaEventRecord(start); cudaEventRecord(start);
dim3 block(3,8); dim3 block(14,2,1);
dim3 thread(1024); dim3 thread(1024);
conjMulAll<<<block, thread>>>(cu_ru.d_txdataF, cu_ru.d_weight, cu_ru.d_res, conjMulAll<<<block, thread>>>(cu_ru.d_txdataF, cu_ru.d_weight, cu_ru.d_subtx,
fftsize, nb_symbols, nb_tx, nb_antenna_ports); fftsize, nb_symbols, nb_tx, nb_antenna_ports);
combine<<<block, thread>>>(cu_ru.d_res, cu_ru.d_txdataF_BF, block = dim3(8, 2, 1);
combine<<<block, thread>>>(cu_ru.d_subtx, cu_ru.d_signal,
fftsize, nb_symbols, nb_tx, nb_antenna_ports); fftsize, nb_symbols, nb_tx, nb_antenna_ports);
cudaEventRecord(stop); cudaEventRecord(stop);
......
...@@ -15,7 +15,7 @@ typedef struct cuda_cu_ru_t{ ...@@ -15,7 +15,7 @@ typedef struct cuda_cu_ru_t{
//beamforming precoding //beamforming precoding
int *d_txdataF;//14symb-port0, 14symb-port1, ...... int *d_txdataF;//14symb-port0, 14symb-port1, ......
int *d_weight;//[p * tx * fftsize] int *d_weight;//[p * tx * fftsize]
int *d_res; int *d_subtx;//14symb-subport0, 14symb-subport1, ..., 14symb-subport0, 14symb-subport1, ...
//ifft //ifft
int *d_txdataF_BF;//14symb-tx0, 14symb-tx1, ...... int *d_txdataF_BF;//14symb-tx0, 14symb-tx1, ......
...@@ -26,6 +26,7 @@ typedef struct cuda_cu_ru_t{ ...@@ -26,6 +26,7 @@ typedef struct cuda_cu_ru_t{
extern cuda_cu_ru cu_ru; extern cuda_cu_ru cu_ru;
#if __cplusplus #if __cplusplus
} }
#endif #endif
......
...@@ -41,6 +41,10 @@ ...@@ -41,6 +41,10 @@
cuda_cu_ru cu_ru; cuda_cu_ru cu_ru;
__constant__ int PORTSIZE;
__constant__ int SUBTXSIZE;
__constant__ int BW_PSIZE;
extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){ extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){
printf("init_cuda %d %d %d \n\n\n", nb_tx, nb_symbols, fftsize); printf("init_cuda %d %d %d \n\n\n", nb_tx, nb_symbols, fftsize);
...@@ -49,7 +53,7 @@ extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){ ...@@ -49,7 +53,7 @@ extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){
//beamforming precoding //beamforming precoding
gpuErrchk( cudaMalloc((void**)&cu_ru.d_txdataF, sizeof(int) * nb_tx*nb_antenna_ports*nb_symbols*fftsize) ); gpuErrchk( cudaMalloc((void**)&cu_ru.d_txdataF, sizeof(int) * nb_tx*nb_antenna_ports*nb_symbols*fftsize) );
gpuErrchk( cudaMalloc((void**)&cu_ru.d_weight, sizeof(int) * nb_tx*nb_antenna_ports*fftsize) ); gpuErrchk( cudaMalloc((void**)&cu_ru.d_weight, sizeof(int) * nb_tx*nb_antenna_ports*fftsize) );
gpuErrchk( cudaMalloc((void**)&cu_ru.d_res, sizeof(int) * nb_tx*nb_antenna_ports*fftsize*nb_symbols) ); gpuErrchk( cudaMalloc((void**)&cu_ru.d_subtx, sizeof(int) * nb_tx*fftsize*nb_symbols*2) );
//ifft //ifft
gpuErrchk( cudaMalloc((void**)&cu_ru.d_txdataF_BF, fftsize*sizeof(int)*nb_symbols*nb_tx) ); gpuErrchk( cudaMalloc((void**)&cu_ru.d_txdataF_BF, fftsize*sizeof(int)*nb_symbols*nb_tx) );
...@@ -57,6 +61,11 @@ extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){ ...@@ -57,6 +61,11 @@ extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){
gpuErrchk( cudaMalloc((void**)&cu_ru.d_data_wCP, fftsize*(nb_symbols+1)*nb_tx*sizeof(int)) ); gpuErrchk( cudaMalloc((void**)&cu_ru.d_data_wCP, fftsize*(nb_symbols+1)*nb_tx*sizeof(int)) );
cufftErrchk( cufftPlan1d(&cu_ru.plan, fftsize, CUFFT_C2C, nb_symbols*nb_tx) ); cufftErrchk( cufftPlan1d(&cu_ru.plan, fftsize, CUFFT_C2C, nb_symbols*nb_tx) );
int portSize = fftsize*nb_symbols;
int subtxsize = nb_tx * nb_symbols * fftsize;
int bw_psize = nb_tx * fftsize;
gpuErrchk( cudaMemcpyToSymbol(PORTSIZE, &portSize, sizeof(int)) );
gpuErrchk( cudaMemcpyToSymbol(SUBTXSIZE, &subtxsize, sizeof(int)) );
gpuErrchk( cudaMemcpyToSymbol(BW_PSIZE, &bw_psize, sizeof(int)) );
} }
...@@ -131,6 +131,12 @@ void CUDA_prec_ofdm(RU_t *ru,int frame_tx,int tti_tx){ ...@@ -131,6 +131,12 @@ void CUDA_prec_ofdm(RU_t *ru,int frame_tx,int tti_tx){
((short*)&ru->common.txdataF[p][j])[0] = 1; ((short*)&ru->common.txdataF[p][j])[0] = 1;
((short*)&ru->common.txdataF[p][j])[1] = 1; ((short*)&ru->common.txdataF[p][j])[1] = 1;
} }
for(int aa=0; aa<ru->nb_tx; aa++){
for(int j=0; j<fp->ofdm_symbol_size; j++){
((short*)&ru->beam_weights[0][aa][p][j])[0] = 2;
((short*)&ru->beam_weights[0][aa][p][j])[1] = 2;
}
}
} }
clock_t start, end; clock_t start, end;
start = clock(); start = clock();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment