v3. decrease GMEM access in beamforming precode

21c5e485 · tyhsu · 6a271962 · 21c5e485 · 21c5e485 · 21c5e485
Commit 21c5e485 authored Nov 21, 2019 by tyhsu
4 changed files
--- a/openair1/CUDA/CUDA_phy_procedure.cu
+++ b/openair1/CUDA/CUDA_phy_procedure.cu
@@ -116,7 +116,7 @@ extern "C" void CUDA_ifft_ofdm( int **output,
 	int threadNum = 1024;
 	int blockNum = fftsize*nb_symbols*nb_tx / threadNum;
-	cu_intToComplex<<<blockNum, threadNum>>>(d_txdataF_BF, d_signal);
+	//cu_intToComplex<<<blockNum, threadNum>>>(d_txdataF_BF, d_signal);
 	//CHECK_STATE("cu_intToComplex");
 	cufftErrchk( cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));
@@ -144,70 +144,78 @@ extern "C" void CUDA_ifft_ofdm( int **output,
 }
 __device__ inline void beamComp(int *res, int *x1, int *x2){
-	((short*)res)[0] = ((short*)x1)[0]*((short*)x2)[0] + ((short*)x1)[1]*((short*)x2)[1];
+	((short*)res)[0] += ((short*)x1)[0]*((short*)x2)[0] + ((short*)x1)[1]*((short*)x2)[1];
-	((short*)res)[1] = ((short*)x1)[0]*((short*)x2)[1] - ((short*)x1)[1]*((short*)x2)[0];
+	((short*)res)[1] += ((short*)x1)[0]*((short*)x2)[1] - ((short*)x1)[1]*((short*)x2)[0];
 }
-__global__ void	conjMulAll(int* txdataF, int* weight, int* res,
+extern __constant__ int PORTSIZE;
+extern __constant__ int SUBTXSIZE;
+extern __constant__ int BW_PSIZE;
+__global__ void	conjMulAll(int* txdataF, int* weight, int* sub,
 		int fftsize, int nb_symbols, int nb_tx, int nb_antenna_ports){
-	__shared__ int x1[2048*5];
+	__shared__ int x1[2048*4];
-	int symbSart = blockIdx.x*5;
+	__shared__ int res[2048];
-	int portId = blockIdx.y;
 	int id = threadIdx.x;
 	int id2 = id+1024;
-	int aaSize = nb_antenna_ports*nb_symbols*fftsize;
+	int symbId = blockIdx.x;
-	int portSize = nb_symbols*fftsize;
+	int portStart = blockIdx.y*4;
+	int subtxId = blockIdx.y;
-	int s1=0;
-	for(int symbId=symbSart; symbId<(symbSart+5)&&symbId<nb_symbols; symbId++){
+	int s1 = 0;
-		x1[s1*fftsize+id] = txdataF[symbId*fftsize+id];
+	for(int p=portStart; p<portStart+4; p++){
-		x1[s1*fftsize+id2] = txdataF[symbId*fftsize+id2];
+		x1[s1*2048+id] = txdataF[p*PORTSIZE + symbId*fftsize + id];
+		x1[s1*2048+id2] = txdataF[p*PORTSIZE + symbId*fftsize + id2];
 		s1++;
 	}
 	for(int aa=0; aa<nb_tx; aa++){
-		for(int symbId=symbSart; symbId<(symbSart+5)&&symbId<nb_symbols; symbId++){
+		res[id] = 0;
-			int resId = aa*aaSize+portId*portSize+symbId*fftsize;
+		res[id2] = 0;
-			s1 = symbId%5;
+		s1 = 0;
-			beamComp(&res[resId+id], &x1[s1*fftsize+id], &weight[portId*(nb_tx*fftsize)+aa*fftsize+id]);
+		for(int p=portStart; p<portStart+4; p++){
-			beamComp(&res[resId+id2], &x1[s1*fftsize+id2], &weight[portId*(nb_tx*fftsize)+aa*fftsize+id2]);
+			beamComp(&res[id], &x1[s1*2048+id], &weight[p*BW_PSIZE+aa*fftsize+id]);
-		}
+			beamComp(&res[id2], &x1[s1*2048+id2], &weight[p*BW_PSIZE+aa*fftsize+id2]);
+			/*
+			if(id==0){
+				printf("%5d+%5di mul %5d+%5di = %5d+%5di\n", 
+						((short*)&x1[s1*2048+id])[0], ((short*)&x1[s1*2048+id])[1],
+						((short*)&weight[p*BW_PSIZE+aa*fftsize+id])[0],((short*)&weight[p*BW_PSIZE+aa*fftsize+id])[1],
+						((short*)&res[id])[0], ((short*)&res[id])[1]);
+			}*/
+			s1++;
 		}
+		int offset = subtxId*SUBTXSIZE + aa*PORTSIZE + symbId*fftsize;
+		sub[offset+id] = res[id];
+		sub[offset+id2] = res[id2];
+	} 	
 }
-__device__ inline void partAdd(int *res, int *x){
+__device__ inline void partAdd(Complex *res, int *x1, int *x2){
-	((short*)res)[0] += ((short*)x)[0];
+	res->x = ((short*)x1)[0] + ((short*)x2)[0];
-	((short*)res)[1] += ((short*)x)[1];
+	res->y = ((short*)x1)[1] + ((short*)x2)[1];
 }
-__global__ void	combine(int* res, int* txdataF_BF, int fftsize, int nb_symbols, int nb_tx, int nb_antenna_ports){
+__global__ void	combine(int* subtx, Complex* d_signal, int fftsize, int nb_symbols, int nb_tx, int nb_antenna_ports){
-	__shared__ int buf[2048*5];
-	int symbStart = blockIdx.x;
-	int txId = blockIdx.y;
 	int id = threadIdx.x;
 	int id2 = id+1024;
-	int txSize = nb_antenna_ports*nb_symbols*fftsize;
+	int aa = blockIdx.x;
-	int portSize = nb_symbols*fftsize;
+	int symbStart = blockIdx.y*7;
+	int symbEnd = symbStart + 7;
-	int s1=0;
+	for(int symb=symbStart; symb<symbEnd; symb++){
-	for(int p=0; p<nb_antenna_ports; p++){
+		int offset = aa*PORTSIZE + symb*fftsize;
-		for(int symbId=symbStart; symbId<symbId+5&&symbId<nb_symbols; symbId++){
+		partAdd(&d_signal[offset+id], &subtx[offset+id], &subtx[SUBTXSIZE+offset+id]);
-			s1 = symbId%5;
+		partAdd(&d_signal[offset+id2], &subtx[offset+id2], &subtx[SUBTXSIZE+offset+id2]);
-			partAdd(&buf[s1+id], &res[txId*txSize+p*portSize+symbId*fftsize+id]);
+		//if(id==0) printf("%5.5f+%5.5fi\n", d_signal[offset+id].x, d_signal[offset+id].y);
-			partAdd(&buf[s1+id2], &res[txId*txSize+p*portSize+symbId*fftsize+id2]);
-		}
-	}
-	for(int symbId=symbStart; symbId<symbId+5&&symbId<nb_symbols; symbId++){
-		s1 = symbId%5;
-		res[txId*nb_symbols*fftsize+symbId*fftsize+id] = buf[s1*fftsize+id];
-		res[txId*nb_symbols*fftsize+symbId*fftsize+id2] = buf[s1*fftsize+id2];
 	}
 }
 extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int shift, int fftsize, int nb_symbols, int nb_antenna_ports, int nb_tx){
 	cudaEvent_t start, stop;
 	float time;
@@ -216,6 +224,7 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int
 	//initial BF data;
 	gpuErrchk( cudaMemset(cu_ru.d_txdataF_BF, 0, fftsize*nb_symbols*sizeof(int)*nb_tx) );
+	gpuErrchk( cudaMemset(cu_ru.d_subtx, 0, fftsize*nb_symbols*nb_tx*2*sizeof(int)) );
 	//move data to gpu
 	int slotsize = fftsize*nb_symbols;
 	for(int p=0; p<nb_antenna_ports; p++){
@@ -223,8 +232,6 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int
 	}
 	cudaEventRecord(start);
-	int threadNum = 1024;
-	int blockNum = fftsize*nb_symbols/threadNum;
 	int div = 1<<shift;
 	for(int aa=0; aa<nb_tx; aa++){
 		for(int p=0; p<nb_antenna_ports; p++){
@@ -241,11 +248,12 @@ extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int
 	cudaEventRecord(start);
-	dim3 block(3,8);
+	dim3 block(14,2,1);
 	dim3 thread(1024);
-	conjMulAll<<<block, thread>>>(cu_ru.d_txdataF, cu_ru.d_weight, cu_ru.d_res,
+	conjMulAll<<<block, thread>>>(cu_ru.d_txdataF, cu_ru.d_weight, cu_ru.d_subtx,
 		fftsize, nb_symbols, nb_tx, nb_antenna_ports);
-	combine<<<block, thread>>>(cu_ru.d_res, cu_ru.d_txdataF_BF, 
+	block = dim3(8, 2, 1);
+	combine<<<block, thread>>>(cu_ru.d_subtx, cu_ru.d_signal, 
 		fftsize, nb_symbols, nb_tx, nb_antenna_ports);
 	cudaEventRecord(stop);

--- a/openair1/CUDA/cuda_struct.h
+++ b/openair1/CUDA/cuda_struct.h
@@ -15,7 +15,7 @@ typedef struct cuda_cu_ru_t{
 	//beamforming precoding
 	int *d_txdataF;//14symb-port0, 14symb-port1, ......
 	int *d_weight;//[p * tx * fftsize]
-	int *d_res;
+	int *d_subtx;//14symb-subport0, 14symb-subport1, ..., 14symb-subport0, 14symb-subport1, ...
 	//ifft
 	int *d_txdataF_BF;//14symb-tx0, 14symb-tx1, ......
@@ -26,6 +26,7 @@ typedef struct cuda_cu_ru_t{
 extern cuda_cu_ru cu_ru;
 #if __cplusplus
 }
 #endif

--- a/openair1/CUDA/init_cuda.cu
+++ b/openair1/CUDA/init_cuda.cu
@@ -41,6 +41,10 @@
 cuda_cu_ru cu_ru;
+__constant__ int PORTSIZE;
+__constant__ int SUBTXSIZE;
+__constant__ int BW_PSIZE;
 extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){
 	printf("init_cuda %d %d %d  \n\n\n", nb_tx, nb_symbols, fftsize);
@@ -49,7 +53,7 @@ extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){
 	//beamforming precoding
 	gpuErrchk( cudaMalloc((void**)&cu_ru.d_txdataF, sizeof(int) * nb_tx*nb_antenna_ports*nb_symbols*fftsize) );
 	gpuErrchk( cudaMalloc((void**)&cu_ru.d_weight, sizeof(int) * nb_tx*nb_antenna_ports*fftsize) );
-	gpuErrchk( cudaMalloc((void**)&cu_ru.d_res, sizeof(int) * nb_tx*nb_antenna_ports*fftsize*nb_symbols) );
+	gpuErrchk( cudaMalloc((void**)&cu_ru.d_subtx, sizeof(int) * nb_tx*fftsize*nb_symbols*2) );
 	//ifft	
 	gpuErrchk( cudaMalloc((void**)&cu_ru.d_txdataF_BF, fftsize*sizeof(int)*nb_symbols*nb_tx) );
@@ -57,6 +61,11 @@ extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){
 	gpuErrchk( cudaMalloc((void**)&cu_ru.d_data_wCP, fftsize*(nb_symbols+1)*nb_tx*sizeof(int)) );
 	cufftErrchk( cufftPlan1d(&cu_ru.plan, fftsize, CUFFT_C2C, nb_symbols*nb_tx) );
+	int portSize  = fftsize*nb_symbols;
+	int subtxsize = nb_tx * nb_symbols * fftsize;
+	int bw_psize = nb_tx * fftsize;
+	gpuErrchk( cudaMemcpyToSymbol(PORTSIZE, &portSize, sizeof(int)) );
+	gpuErrchk( cudaMemcpyToSymbol(SUBTXSIZE, &subtxsize, sizeof(int)) );
+	gpuErrchk( cudaMemcpyToSymbol(BW_PSIZE, &bw_psize, sizeof(int)) );
 }
--- a/openair1/SCHED_NR/nr_ru_procedures.c
+++ b/openair1/SCHED_NR/nr_ru_procedures.c
@@ -131,6 +131,12 @@ void CUDA_prec_ofdm(RU_t *ru,int frame_tx,int tti_tx){
 			((short*)&ru->common.txdataF[p][j])[0] = 1;
 			((short*)&ru->common.txdataF[p][j])[1] = 1;
 		}
+		for(int aa=0; aa<ru->nb_tx; aa++){
+			for(int j=0; j<fp->ofdm_symbol_size; j++){
+				((short*)&ru->beam_weights[0][aa][p][j])[0] = 2;
+				((short*)&ru->beam_weights[0][aa][p][j])[1] = 2;
+			}
+		}
    }
 	clock_t start, end;	
 	start = clock();