v1. move cpu function to gpu

b76e487d · tyhsu · fdc78bfb · b76e487d · b76e487d · b76e487d
Commit b76e487d authored Nov 15, 2019 by tyhsu
13 changed files
--- a/cmake_targets/CMakeLists.txt
+++ b/cmake_targets/CMakeLists.txt
@@ -52,9 +52,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -I${OPENAIR1_DIR}/ ")

 ### ADD CUDA LIBRARY
 CUDA_ADD_LIBRARY(PHY_CU ${OPENAIR1_DIR}/CUDA/CUDA_phy_procedure.cu
-	${OPENAIR1_DIR}/CUDA/init_cuda.cu
 	${OPENAIR1_DIR}/CUDA/struct.h 
-	${OPENAIR1_DIR}/CUDA/cuda_struct.h
 	${OPENAIR1_DIR}/CUDA/checkError.h 
 	)


--- a/executables/nr-ru.c
+++ b/executables/nr-ru.c
@@ -2033,10 +2033,8 @@ void set_function_spec_param(RU_t *ru) {
      } else if (ru->function == gNodeB_3GPP) {
        ru->do_prach             = 0;                       // no prach processing in RU
        ru->feprx                = (get_thread_worker_conf() == WORKER_ENABLE) ? ru_fep_full_2thread : fep_full;                // RX DFTs
-        //ru->feptx_ofdm           = (get_thread_worker_conf() == WORKER_ENABLE) ? nr_feptx_ofdm_2thread : nr_feptx_ofdm;              // this is fep with idft and precoding
-        ru->feptx_ofdm           = CUDA_prec_ofdm;              // this is fep with idft and precoding
-        //ru->feptx_prec           = nr_feptx_prec;           // this is fep with idft and precoding
-        ru->feptx_prec           = NULL;           // this is fep with idft and precoding
+        ru->feptx_ofdm           = (get_thread_worker_conf() == WORKER_ENABLE) ? nr_feptx_ofdm_2thread : nr_feptx_ofdm;              // this is fep with idft and precoding
+        ru->feptx_prec           = nr_feptx_prec;           // this is fep with idft and precoding
        ru->fh_north_in          = NULL;                    // no incoming fronthaul from north
        ru->fh_north_out         = NULL;                    // no outgoing fronthaul to north
        ru->nr_start_if          = NULL;                    // no if interface

--- a/openair1/CUDA/CUDA_phy_procedure.cu
+++ b/openair1/CUDA/CUDA_phy_procedure.cu
-/*
- * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The OpenAirInterface Software Alliance licenses this file to You under
- * the OAI Public License, Version 1.1  (the "License"); you may not use this file
- * except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.openairinterface.org/?page_id=698
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *-------------------------------------------------------------------------------
- * For more information about the OpenAirInterface (OAI) Software Alliance:
- *      contact@openairinterface.org
- */
-
- /*! \file CUDA_phy_procedure.cu
- * \brief Create and Implementation of beamforming and ifft in gpu
- * \author TY Hsu, CW Chang
- * \date 2018
- * \version 0.1
- * \company ISIP@NCTU and Eurecom
- * \email: tyhsu@cs.nctu.edu.tw, zhang0756107.cs07g@nctu.edu.tw
- * \note
- * \warning
- */
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
@@ -37,7 +6,7 @@
 #include <cufft.h>
 #include "CUDA/checkError.h"
 #include "CUDA/struct.h"
-#include "CUDA/cuda_struct.h"
+

 __global__ void cu_intToComplex(int *input, Complex *signal){
 	int id = blockIdx.x*1024 + threadIdx.x;
@@ -51,141 +20,141 @@ __global__ void cu_ComplexToInt(int *output, Complex *signal){
 	((short*)&output[id])[1] = round(signal[id].y);
 }

-__global__ void cu_CP_fft_resultin(unsigned char nb_prefix_samples, int *input, int *output, int fftsize, int nb_symbols){
-	int id = blockIdx.x*1024 + threadIdx.x;
-	int elementId = id%fftsize;
-	int symbolId = id/fftsize;
-	int slotId = symbolId/nb_symbols;
-	int symbIdinSlot = symbolId%nb_symbols;
-	int slotElmtNum = fftsize*(nb_symbols+1);
-	int CPElmtNum = fftsize+nb_prefix_samples;
-
-	int offset = slotId*slotElmtNum + symbIdinSlot*CPElmtNum;
-	output[offset + nb_prefix_samples + elementId] = input[id];
-	if(elementId >= fftsize-nb_prefix_samples){
-		output[offset + (fftsize-nb_prefix_samples)] = input[id];
-	}
-}
-
-__global__ void cu_CP0_fft_resultin(unsigned char nb_prefix_samples0, unsigned char nb_prefix_samples, 
-								int *input, int *output, int fftsize, int nb_symbols){
-	int id = blockIdx.x*1024 + threadIdx.x;
-	int elementId = id%fftsize;
-	int symbolId = id/fftsize;
-	int slotId = symbolId/nb_symbols;
-	int symbIdinSlot = symbolId%nb_symbols;
-	int slotElmtNum = fftsize*(nb_symbols+1);
-	int CP0ElmtNum = fftsize+nb_prefix_samples0;
-	int CPElmtNum = fftsize+nb_prefix_samples;
-
-	if(symbIdinSlot==0){
-		int offset = slotId*slotElmtNum;
-		output[offset + nb_prefix_samples0+ elementId] = input[id];
-		if(elementId >= fftsize-nb_prefix_samples0){
-			output[offset + (fftsize-nb_prefix_samples0)] = input[id];
-		} 
-	}else{
-		int offset = slotId*slotElmtNum + CP0ElmtNum + (symbIdinSlot-1)*CPElmtNum;
-		output[offset + nb_prefix_samples + elementId] = input[id];
-		if(elementId >= fftsize-nb_prefix_samples){
-			output[offset + (fftsize-nb_prefix_samples)] = input[id];
-		}
-	}
-}
-
-extern "C" void CUDA_ifft_ofdm( int **output, 
+extern "C" void CUDA_PHY_ofdm_mod(int *input, 
+				int *output, 
 				int fftsize, 
 				unsigned char nb_symbols, 
-				unsigned char nb_prefix_samples,
-				unsigned char nb_prefix_samples0,
-				int nb_tx,
-				int Ncp,
+				unsigned short nb_prefix_samples, 
 				Extension_t etype){
-	//for(int i=0; i<fftsize; i++) printf("%d+%di\n", ((short*)&input[0][i])[0], ((short*)&input[0][i])[1]);
+	if(nb_symbols==0) return ;

-	int *d_txdataF_BF = cu_ru.d_txdataF_BF;
-	int *d_data_wCP = cu_ru.d_data_wCP;
-	Complex *d_signal = cu_ru.d_signal;
-	cufftHandle plan = cu_ru.plan; 

-	/*
-	for(int aa=0; aa<nb_tx; aa++){
-		int elementNum = fftsize*nb_symbols;
-		gpuErrchk( cudaMemcpy(&d_data[aa*elementNum], input[aa], sizeof(int)*elementNum, cudaMemcpyHostToDevice) );
-	}*/
+	Complex *d_signal;
+	gpuErrchk( cudaMalloc((void**)&d_signal, fftsize*sizeof(Complex)*nb_symbols) );
+
+	int *d_data;
+	gpuErrchk( cudaMalloc((void**)&d_data, fftsize*sizeof(int)*nb_symbols) );
+	gpuErrchk( cudaMemcpy(d_data, input, fftsize*sizeof(int)*nb_symbols, cudaMemcpyHostToDevice) );

 	int threadNum = 1024;
-	int blockNum = fftsize*nb_symbols*nb_tx / threadNum;
-	cu_intToComplex<<<blockNum, threadNum>>>(d_txdataF_BF, d_signal);
-	//CHECK_STATE("cu_intToComplex");
+	int blockNum = fftsize*nb_symbols/threadNum;
+	cu_intToComplex<<<blockNum, threadNum>>>(d_data, d_signal);
 	
-	cufftErrchk( cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE));

-	cu_ComplexToInt<<<blockNum, threadNum>>>(d_txdataF_BF, d_signal);
-	//CHECK_STATE("cu_ComplexToInt");
+	cufftHandle plan;
+	cufftErrchk( cufftPlan1d(&plan, fftsize, CUFFT_C2C, nb_symbols) );
+	cufftErrchk( cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD) );

+	Complex *h_output = (Complex*)malloc(fftsize*sizeof(Complex)*nb_symbols) ;
+	gpuErrchk( cudaMemcpy(h_output, d_signal, fftsize*sizeof(Complex)*nb_symbols, cudaMemcpyDeviceToHost) );

-	//only do cyclic_prefix, suffix/none not finish yet.
-	if(Ncp==1){
-		cu_CP_fft_resultin<<<blockNum, threadNum>>>(nb_prefix_samples, d_txdataF_BF, d_data_wCP, fftsize, nb_symbols);
-		//CHECK_STATE("cu_CP_fft_resultin");	
-	}else{
-		cu_CP0_fft_resultin<<<blockNum, threadNum>>>(nb_prefix_samples0, nb_prefix_samples, d_txdataF_BF, d_data_wCP, fftsize, nb_symbols);
-		//CHECK_STATE("cu_CP0_fft_resultin");	
+	cu_ComplexToInt<<<blockNum, threadNum>>>(d_data, d_signal);
+
+	int *res = (int*)malloc(fftsize*sizeof(int)*nb_symbols);
+	gpuErrchk( cudaMemcpy(res, d_data, fftsize*sizeof(int)*nb_symbols, cudaMemcpyDeviceToHost) );
+
+/*	
+	for(int i=0; i<fftsize*nb_symbols; i++){
+		printf("res(%d) %d+%di\n", i, ((short*)&res[i])[0], ((short*)&res[i])[1]);
+	}*/
+
+	for(int symb_th=0; symb_th<nb_symbols; symb_th++){
+		int* output_ptr;
+		switch(etype){
+			case CYCLIC_PREFIX:{
+				output_ptr = &output[symb_th*fftsize + (1+symb_th)*nb_prefix_samples];
+				memcpy(output_ptr, res, fftsize<<2);
+
+				int j=fftsize;
+				for(int k=-1; k>=-nb_prefix_samples; k--){
+					output_ptr[k] = output_ptr[--j];
+				}
+				break;
 			}
+			case CYCLIC_SUFFIX:{
+				output_ptr = &output[symb_th*fftsize + (symb_th)*nb_prefix_samples];
+				memcpy(output_ptr, res, fftsize<<2);

-	//write back gpu->cpu
-	for(int aa=0; aa<nb_tx; aa++){
-		gpuErrchk( cudaMemcpy(output[aa], &d_data_wCP[aa*(nb_symbols+1)*fftsize], fftsize*(nb_symbols+1)*sizeof(int), cudaMemcpyDeviceToHost) );
+				for(int k=0; k<nb_prefix_samples; k++){
+					output_ptr[fftsize+k] = output_ptr[k];
+				}
+				break;
+			}
+			case ZEROS:{
+				break;
+			}
+			case NONE:{
+				output_ptr = &output[fftsize];
+				memcpy(output_ptr, res, fftsize<<2);
+				break;
 			}
-	cudaDeviceSynchronize();

+			default:{
+				break;
+			}
+
+		}

+		cufftDestroy(plan);
+		free(h_output);
+		free(res);
+		cudaFree(d_signal);
+	}
 }

-__global__ void conjMul(int *d_x1, int *d_x2, int *d_y, int aa, int div, int fftsize, int nb_symbols){
+__global__ void conjMul(int *d_x1, int *d_x2, int *d_y, short zero_flag,unsigned int div){
 	int id = blockIdx.x*1024 + threadIdx.x;
 	int *x1 = &d_x1[id];
-	int *x2 = &d_x2[id%fftsize];
-	int *y = &d_y[aa*fftsize*nb_symbols + id];
+	int *x2 = &d_x2[id];
+	int *y = &d_y[id];

 	int re, im;
-	//conj(x1) * x2
-	re = ((short*)x1)[0]*((short*)x2)[0] + ((short*)x1)[1]*((short*)x2)[1];
-	im = ((short*)x1)[0]*((short*)x2)[1] - ((short*)x1)[1]*((short*)x2)[0];
+
+	((short*)x1)[1] *= -1;
+	re = ((short*)x1)[0]*((short*)x2)[0] - ((short*)x1)[1]*((short*)x2)[1];
+	im = ((short*)x1)[1]*((short*)x2)[0] + ((short*)x1)[0]*((short*)x2)[1];

 	re = re / div;
 	im = im / div;

+	if(zero_flag){
+		((short*)y)[0] = re; 
+		((short*)y)[1] = im;
+	}else{
 		((short*)y)[0] += re; 
 		((short*)y)[1] += im;	
+	}

 }

+extern "C" void CUDA_multadd_cpx_vector(int *x1, int *x2, int *y, short zero_flag,unsigned int N, int output_shift){
+	int *d_x1, *d_x2, *d_y;
+ 	gpuErrchk( cudaMalloc((void**)&d_x1, N*sizeof(int)) );
+ 	gpuErrchk( cudaMalloc((void**)&d_x2, N*sizeof(int)) );
+ 	gpuErrchk( cudaMalloc((void**)&d_y, N*sizeof(int)) );

-extern "C" void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int shift, int fftsize, int nb_symbols, int nb_antenna_ports, int nb_tx){
+	gpuErrchk( cudaMemcpy(d_x1, x1, N*sizeof(int), cudaMemcpyHostToDevice) );
+	gpuErrchk( cudaMemcpy(d_x2, x2, N*sizeof(int), cudaMemcpyHostToDevice) );

-	//initial BF data;
-	gpuErrchk( cudaMemset(cu_ru.d_txdataF_BF, 0, fftsize*nb_symbols*sizeof(int)*nb_tx) );
-	//move data to gpu
-	for(int p=0; p<nb_antenna_ports; p++){
-		gpuErrchk( cudaMemcpy(cu_ru.d_txdataF[p], txdataF[p], fftsize*sizeof(int)*nb_symbols, cudaMemcpyHostToDevice) );	
+	if(zero_flag == 0){
+		gpuErrchk( cudaMemcpy(d_y, y, N*sizeof(int), cudaMemcpyHostToDevice) );	
+	}else{
+		gpuErrchk( cudaMemset(d_y, 0, N*sizeof(int)) );
 	}

+	unsigned int div = 1;
+	div = div << output_shift;

 	int threadNum = 1024;
-	int blockNum = fftsize*nb_symbols/threadNum;
-	int div = 1<<shift;
-	for(int aa=0; aa<nb_tx; aa++){
-		for(int p=0; p<nb_antenna_ports; p++){
-			if((L_ssb>>p) & 0x01){
-				gpuErrchk( cudaMemcpy(cu_ru.d_weight[p][aa], weight[p][aa], fftsize*sizeof(int), cudaMemcpyHostToDevice) );
-				conjMul<<<blockNum, threadNum>>>(cu_ru.d_txdataF[p], cu_ru.d_weight[p][aa], 
-							cu_ru.d_txdataF_BF, aa, div, fftsize, nb_symbols);
-			}
-		}
-	}
+	int blockNum = N / threadNum;	
+	conjMul<<<blockNum, threadNum>>>(d_x1, d_x2, d_y, zero_flag, div);
+	cudaDeviceSynchronize();
+	CHECK_STATE("conjMul");

+	gpuErrchk( cudaMemcpy(y, d_y, N*sizeof(int), cudaMemcpyDeviceToHost) );

-}

+	cudaFree(d_x1);
+	cudaFree(d_x2);
+	cudaFree(d_y);
+}
--- a/openair1/CUDA/CUDA_phy_procedure_def.h
+++ b/openair1/CUDA/CUDA_phy_procedure_def.h
 #ifndef CUDA
 #define CUDA

-#include "cuda_struct.h"

 #if __cplusplus
 extern "C" {
 #endif

 void CUDA_hello(void);
-void CUDA_ifft_ofdm( int **output, 
+void CUDA_PHY_ofdm_mod(int *input, 
+				int *output, 
 				int fftsize, 
 				unsigned char nb_symbols, 
-				unsigned char nb_prefix_samples,
-				unsigned char nb_prefix_samples0,
-				int nb_tx,
-				int Ncp,
+				unsigned short nb_prefix_samples, 
 				Extension_t etype);
-void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int shift, int fftsize, int nb_symbols, int nb_antenna_ports, int nb_tx);
+void CUDA_multadd_cpx_vector(int* x1, int *x2, int *y, short zero_flag, unsigned int N, int output_shift);

 #if __cplusplus
 }

--- a/openair1/CUDA/checkError.h
+++ b/openair1/CUDA/checkError.h
-#ifndef CHECKERROR_H
-#define CHECKERROR_H
+__global__ void gpu_hello(void){
+	printf("Hello world from GPU!\n");
+}
+
+extern "C" void CUDA_hello(void){
+	printf("ready to gpu_hello\n");
+	gpu_hello<<<1,1>>>();
+	cudaDeviceSynchronize();
+}

 static const char* _cudaGetErrorEnum(cufftResult error){
 	switch (error){
@@ -36,7 +43,6 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t

 #define CHECK_STATE(msg) {checkCudaState((msg), __FILE__, __LINE__);}
 inline void checkCudaState(const char *msg, const char *file, const int line){
-	cudaDeviceSynchronize();
 	cudaError_t err = cudaGetLastError();
 	if(err != cudaSuccess) {
 		fprintf(stderr, "[%s]gpu error: %s %s %d\n", msg, cudaGetErrorString(err), file, line);
@@ -49,4 +55,4 @@ inline void checkCudaState(const char *msg, const char *file, const int line){



-#endif
+
--- a/openair1/CUDA/cuda_struct.h
+++ b/openair1/CUDA/cuda_struct.h
-#ifndef CUDA_STRUCT_H
-#define CUDA_STRUCT_H
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cufft.h>
-
-#if __cplusplus
-extern "C" {
-#endif
-
-typedef float2 Complex;
-
-typedef struct cuda_cu_ru_t{
-	//beamforming precoding
-	int **d_txdataF;//14symb-port0, 14symb-port1, ......
-	int ***d_weight;//[p][tx][symb]
-	cudaStream_t *d_beam_stream;
-
-	//ifft
-	int *d_txdataF_BF;//14symb-tx0, 14symb-tx1, ......
-	Complex *d_signal;
-	int *d_data_wCP;
-	cufftHandle plan;
-}cuda_cu_ru;
-extern cuda_cu_ru cu_ru;
-
-
-#if __cplusplus
-}
-#endif
-
-#endif
--- a/openair1/CUDA/init_cuda.cu
+++ b/openair1/CUDA/init_cuda.cu
-/*
- * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The OpenAirInterface Software Alliance licenses this file to You under
- * the OAI Public License, Version 1.1  (the "License"); you may not use this file
- * except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.openairinterface.org/?page_id=698
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *-------------------------------------------------------------------------------
- * For more information about the OpenAirInterface (OAI) Software Alliance:
- *      contact@openairinterface.org
- */
-
- /*! \file init_cuda.cu
- * \brief Create and Implementation of beamforming and ifft in gpu
- * \author TY Hsu, CW Chang
- * \date 2018
- * \version 0.1
- * \company ISIP@NCTU and Eurecom
- * \email: tyhsu@cs.nctu.edu.tw, zhang0756107.cs07g@nctu.edu.tw
- * \note
- * \warning
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cufft.h>
-#include "CUDA/checkError.h"
-#include "CUDA/struct.h"
-#include "CUDA/cuda_struct.h"
-
-cuda_cu_ru cu_ru;
-
-extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){
-	printf("init_cuda %d %d %d  \n\n\n", nb_tx, nb_symbols, fftsize);
-
-	int nb_antenna_ports = 8;
-
-	//beamforming precoding
-	cu_ru.d_txdataF = (int**)malloc(sizeof(int*) * nb_antenna_ports);
-	for(int p=0; p<nb_antenna_ports; p++){
-		gpuErrchk( cudaMalloc((void**)&cu_ru.d_txdataF[p], fftsize*sizeof(int)*nb_symbols) );
-	}
-	cu_ru.d_beam_stream = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nb_tx);
-	for(int aa=0; aa<nb_tx; aa++){
-		gpuErrchk( cudaStreamCreate(&cu_ru.d_beam_stream[aa]) );	
-	}
-
-
-	cu_ru.d_weight = (int***)malloc(sizeof(int**) * nb_antenna_ports);
-	for(int p=0; p<nb_antenna_ports; p++){
-		cu_ru.d_weight[p] = (int**)malloc(sizeof(int*) * nb_tx);
-		for(int aa=0; aa<nb_tx; aa++){
-			gpuErrchk( cudaMalloc((void**)&cu_ru.d_weight[p][aa], fftsize*sizeof(int)) );
-		}
-	}
-
-	//ifft	
-	gpuErrchk( cudaMalloc((void**)&cu_ru.d_txdataF_BF, fftsize*sizeof(int)*nb_symbols*nb_tx) );
-	gpuErrchk( cudaMalloc((void**)&cu_ru.d_signal, fftsize*sizeof(Complex)*nb_symbols*nb_tx) );
-	gpuErrchk( cudaMalloc((void**)&cu_ru.d_data_wCP, fftsize*(nb_symbols+1)*nb_tx*sizeof(int)) );
-	cufftErrchk( cufftPlan1d(&cu_ru.plan, fftsize, CUFFT_C2C, nb_symbols*nb_tx) );
-
-
-
-}
--- a/openair1/CUDA/init_cuda_def.h
+++ b/openair1/CUDA/init_cuda_def.h
 #ifndef INIT_CUDA_DEF
 #define INIT_CUDA_DEF

-#include "cuda_struct.h"
-
-#if __cplusplus
-extern "C" {
-#endif
+#include "struct.h"

+typedef cuda_ifft_t{
+	Complex *d_signal;
+	Complex *d_output;
+	int *d_data;	
+}cuda_ifft

-void init_cuda(int nb_tx, int nb_symbols, int fftsize);


-#if __cplusplus
-}
-#endif
-
 #endif
--- a/openair1/CUDA/struct.h
+++ b/openair1/CUDA/struct.h
-#ifndef OAI_STRUCT
-#define OAI_STRUCT
+#ifndef CUDA_STRUCT
+#define CUDA_STRUCT

+#include <cuda.h>
+#include <cuda_runtime.h>
+
+typedef float2 Complex;

 typedef enum {
 	  CYCLIC_PREFIX,

--- a/openair1/PHY/INIT/nr_init.c
+++ b/openair1/PHY/INIT/nr_init.c
@@ -33,7 +33,6 @@
 #include "LAYER2/MAC/mac_extern.h"
 #include "assertions.h"
 #include <math.h>
-#include "openair1/CUDA/init_cuda_def.h"

 #include "PHY/NR_TRANSPORT/nr_ulsch.h"
 #include "PHY/NR_REFSIG/nr_refsig.h"

--- a/openair1/PHY/INIT/nr_init_ru.c
+++ b/openair1/PHY/INIT/nr_init_ru.c
@@ -31,7 +31,6 @@
 #include "assertions.h"
 #include <math.h>
 #include "openair1/PHY/defs_RU.h"
-#include "openair1/CUDA/init_cuda_def.h"

 int nr_phy_init_RU(RU_t *ru) {

@@ -42,7 +41,6 @@ int nr_phy_init_RU(RU_t *ru) {

  LOG_I(PHY,"Initializing RU signal buffers (if_south %s) nb_tx %d\n",ru_if_types[ru->if_south],ru->nb_tx);

-  init_cuda(ru->nb_tx, fp->symbols_per_slot, fp->ofdm_symbol_size);

  if (ru->if_south <= REMOTE_IF5) { // this means REMOTE_IF5 or LOCAL_RF, so allocate memory for time-domain signals 
    // Time-domain signals

--- a/openair1/PHY/MODULATION/beamforming.c
+++ b/openair1/PHY/MODULATION/beamforming.c
@@ -52,6 +52,7 @@
 #include "modulation_eNB.h"
 #include "nr_modulation.h"
 #include "common/utils/LOG/vcd_signal_dumper.h"
+#include "CUDA/CUDA_phy_procedure_def.h"


 int beam_precoding(int32_t **txdataF,
@@ -165,9 +166,16 @@ int nr_beam_precoding(int32_t **txdataF,
 	}
  }	

+  void (*multadd_cpx_vector_ptr)(int*, int*, int*, short, unsigned int, int);
+#ifdef CUDA
+multadd_cpx_vector_ptr = CUDA_multadd_cpx_vector;
+#else
+multadd_cpx_vector_ptr = multadd_cpx_vector;
+#endif
+
  for (p=0; p<nb_antenna_ports; p++) {
    if ((frame_parms->L_ssb >> p) & 0x01)  {
-      multadd_cpx_vector((int16_t*)&txdataF[p][symbol*frame_parms->ofdm_symbol_size],
+      multadd_cpx_vector_ptr((int16_t*)&txdataF[p][symbol*frame_parms->ofdm_symbol_size],
 			 (int16_t*)beam_weights[p][aa], 
 			 (int16_t*)&txdataF_BF[aa][symbol*frame_parms->ofdm_symbol_size], 
 			 0, 

--- a/openair1/SCHED_NR/nr_ru_procedures.c
+++ b/openair1/SCHED_NR/nr_ru_procedures.c
@@ -77,8 +77,15 @@ void nr_feptx0(RU_t *ru,int tti_tx,int first_symbol, int num_symbols, int aa) {

  LOG_D(PHY,"SFN/SF:RU:TX:%d/%d Generating slot %d (first_symbol %d num_symbols %d)\n",ru->proc.frame_tx, ru->proc.tti_tx,slot,first_symbol,num_symbols);

+void (*PHY_ofdm_mod_ptr)(int*, int*, int, unsigned char, unsigned short, Extension_t);
+#ifdef CUDA
+PHY_ofdm_mod_ptr = CUDA_PHY_ofdm_mod;
+#else
+PHY_ofdm_mod_ptr = PHY_ofdm_mod;
+#endif
+
  if (fp->Ncp == 1) {
-    PHY_ofdm_mod(&ru->common.txdataF_BF[aa][slot_offsetF],
+    PHY_ofdm_mod_ptr(&ru->common.txdataF_BF[aa][slot_offsetF],
                 (int*)&ru->common.txdata[aa][slot_offset],
                 fp->ofdm_symbol_size,
                 num_symbols,
@@ -87,13 +94,13 @@ void nr_feptx0(RU_t *ru,int tti_tx,int first_symbol, int num_symbols, int aa) {
  }
  else {
    if (first_symbol==0) {
-      PHY_ofdm_mod(&ru->common.txdataF_BF[aa][slot_offsetF],
+      PHY_ofdm_mod_ptr(&ru->common.txdataF_BF[aa][slot_offsetF],
                   (int*)&ru->common.txdata[aa][slot_offset],
                   fp->ofdm_symbol_size,
                   1,
                   fp->nb_prefix_samples0,
                   CYCLIC_PREFIX);
-      PHY_ofdm_mod(&ru->common.txdataF_BF[aa][slot_offsetF+fp->ofdm_symbol_size],
+      PHY_ofdm_mod_ptr(&ru->common.txdataF_BF[aa][slot_offsetF+fp->ofdm_symbol_size],
                   (int*)&ru->common.txdata[aa][slot_offset+fp->nb_prefix_samples0+fp->ofdm_symbol_size],
                   fp->ofdm_symbol_size,
                   num_symbols-1,
@@ -101,7 +108,7 @@ void nr_feptx0(RU_t *ru,int tti_tx,int first_symbol, int num_symbols, int aa) {
                   CYCLIC_PREFIX);
    }
    else {
-      PHY_ofdm_mod(&ru->common.txdataF_BF[aa][slot_offsetF],
+      PHY_ofdm_mod_ptr(&ru->common.txdataF_BF[aa][slot_offsetF],
                   (int*)&ru->common.txdata[aa][slot_offset],
                   fp->ofdm_symbol_size,
                   num_symbols,
@@ -113,40 +120,8 @@ void nr_feptx0(RU_t *ru,int tti_tx,int first_symbol, int num_symbols, int aa) {
  //VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_RU_FEPTX_OFDM+(first_symbol!=0?1:0), 0);
 }

-void CUDA_prec_ofdm(RU_t *ru,int frame_tx,int tti_tx){
-  	nfapi_nr_config_request_t *cfg = &ru->gNB_list[0]->gNB_config;
-	if(nr_slot_select(cfg, tti_tx) == SF_UL) return;
-
-	int slot = tti_tx;
-	NR_DL_FRAME_PARMS *fp = ru->nr_frame_parms;
-	PHY_VARS_gNB *gNB = ru->gNB_list[0];
-	int nb_antenna_ports = 8;
-
-	//data L1 to ru
-    for(int p=0; p<nb_antenna_ports; ++p){
-		memcpy((void*)ru->common.txdataF[p], (void*)&gNB->common_vars.txdataF[p],
-				fp->ofdm_symbol_size*sizeof(int32_t)*fp->symbols_per_slot);
-		//fake data
-		for(int j=0; j<fp->ofdm_symbol_size*fp->symbols_per_slot; j++){
-			((short*)&ru->common.txdataF[p][j])[0] = 1;
-			((short*)&ru->common.txdataF[p][j])[1] = 1;
-		}
-    }
-	
-	CUDA_beam_precoding((int**)ru->common.txdataF, (int***)ru->beam_weights[0], fp->L_ssb, 3,
-			fp->ofdm_symbol_size, fp->symbols_per_slot, nb_antenna_ports, ru->nb_tx);
-
-	CUDA_ifft_ofdm((int**)ru->common.txdata, 
-			 fp->ofdm_symbol_size, fp->symbols_per_slot, 
-			 fp->nb_prefix_samples, fp->nb_prefix_samples0, ru->nb_tx,
-			 fp->Ncp, CYCLIC_PREFIX);
-
-	
-
-}

 void nr_feptx_ofdm_2thread(RU_t *ru,int frame_tx,int tti_tx) {
-printf("nr_feptx_ofdm_2thread : frame_tx:%d tti_tx:%d\n", frame_tx, tti_tx);return; 

  nfapi_nr_config_request_t *cfg = &ru->gNB_list[0]->gNB_config;
  RU_proc_t  *proc  = &ru->proc;
@@ -245,7 +220,6 @@ printf("nr_feptx_ofdm_2thread : frame_tx:%d tti_tx:%d\n", frame_tx, tti_tx);retu
 }

 static void *nr_feptx_thread(void *param) {
-
  RU_feptx_t *feptx = (RU_feptx_t *)param;
  RU_t       *ru;
  int         aa, slot, start, l, nb_antenna_ports, ret;