v1. move cpu function to gpu

b76e487d · tyhsu · fdc78bfb · b76e487d · b76e487d · b76e487d
Commit b76e487d authored Nov 15, 2019 by tyhsu
13 changed files
--- a/cmake_targets/CMakeLists.txt
+++ b/cmake_targets/CMakeLists.txt
@@ -52,9 +52,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -I${OPENAIR1_DIR}/ ")
 ### ADD CUDA LIBRARY
 CUDA_ADD_LIBRARY(PHY_CU ${OPENAIR1_DIR}/CUDA/CUDA_phy_procedure.cu
-	${OPENAIR1_DIR}/CUDA/init_cuda.cu
 	${OPENAIR1_DIR}/CUDA/struct.h 
-	${OPENAIR1_DIR}/CUDA/cuda_struct.h
 	${OPENAIR1_DIR}/CUDA/checkError.h 
 	)

--- a/executables/nr-ru.c
+++ b/executables/nr-ru.c
@@ -2033,10 +2033,8 @@ void set_function_spec_param(RU_t *ru) {
      } else if (ru->function == gNodeB_3GPP) {
        ru->do_prach             = 0;                       // no prach processing in RU
        ru->feprx                = (get_thread_worker_conf() == WORKER_ENABLE) ? ru_fep_full_2thread : fep_full;                // RX DFTs
-        //ru->feptx_ofdm           = (get_thread_worker_conf() == WORKER_ENABLE) ? nr_feptx_ofdm_2thread : nr_feptx_ofdm;              // this is fep with idft and precoding
+        ru->feptx_ofdm           = (get_thread_worker_conf() == WORKER_ENABLE) ? nr_feptx_ofdm_2thread : nr_feptx_ofdm;              // this is fep with idft and precoding
-        ru->feptx_ofdm           = CUDA_prec_ofdm;              // this is fep with idft and precoding
+        ru->feptx_prec           = nr_feptx_prec;           // this is fep with idft and precoding
-        //ru->feptx_prec           = nr_feptx_prec;           // this is fep with idft and precoding
-        ru->feptx_prec           = NULL;           // this is fep with idft and precoding
        ru->fh_north_in          = NULL;                    // no incoming fronthaul from north
        ru->fh_north_out         = NULL;                    // no outgoing fronthaul to north
        ru->nr_start_if          = NULL;                    // no if interface

--- a/openair1/CUDA/CUDA_phy_procedure.cu
+++ b/openair1/CUDA/CUDA_phy_procedure.cu
--- a/openair1/CUDA/CUDA_phy_procedure_def.h
+++ b/openair1/CUDA/CUDA_phy_procedure_def.h
 #ifndef CUDA
 #define CUDA
-#include "cuda_struct.h"
 #if __cplusplus
 extern "C" {
 #endif
 void CUDA_hello(void);
-void CUDA_ifft_ofdm( int **output, 
+void CUDA_PHY_ofdm_mod(int *input, 
+				int *output, 
 				int fftsize, 
 				unsigned char nb_symbols, 
-				unsigned char nb_prefix_samples,
+				unsigned short nb_prefix_samples, 
-				unsigned char nb_prefix_samples0,
-				int nb_tx,
-				int Ncp,
 				Extension_t etype);
-void CUDA_beam_precoding(int **txdataF, int ***weight, int L_ssb, int shift, int fftsize, int nb_symbols, int nb_antenna_ports, int nb_tx);
+void CUDA_multadd_cpx_vector(int* x1, int *x2, int *y, short zero_flag, unsigned int N, int output_shift);
 #if __cplusplus
 }

--- a/openair1/CUDA/checkError.h
+++ b/openair1/CUDA/checkError.h
-#ifndef CHECKERROR_H
+__global__ void gpu_hello(void){
-#define CHECKERROR_H
+	printf("Hello world from GPU!\n");
+}
+extern "C" void CUDA_hello(void){
+	printf("ready to gpu_hello\n");
+	gpu_hello<<<1,1>>>();
+	cudaDeviceSynchronize();
+}
 static const char* _cudaGetErrorEnum(cufftResult error){
 	switch (error){
@@ -36,7 +43,6 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
 #define CHECK_STATE(msg) {checkCudaState((msg), __FILE__, __LINE__);}
 inline void checkCudaState(const char *msg, const char *file, const int line){
-	cudaDeviceSynchronize();
 	cudaError_t err = cudaGetLastError();
 	if(err != cudaSuccess) {
 		fprintf(stderr, "[%s]gpu error: %s %s %d\n", msg, cudaGetErrorString(err), file, line);
@@ -49,4 +55,4 @@ inline void checkCudaState(const char *msg, const char *file, const int line){
-#endif
--- a/openair1/CUDA/cuda_struct.h
+++ b/openair1/CUDA/cuda_struct.h
-#ifndef CUDA_STRUCT_H
-#define CUDA_STRUCT_H
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cufft.h>
-#if __cplusplus
-extern "C" {
-#endif
-typedef float2 Complex;
-typedef struct cuda_cu_ru_t{
-	//beamforming precoding
-	int **d_txdataF;//14symb-port0, 14symb-port1, ......
-	int ***d_weight;//[p][tx][symb]
-	cudaStream_t *d_beam_stream;
-	//ifft
-	int *d_txdataF_BF;//14symb-tx0, 14symb-tx1, ......
-	Complex *d_signal;
-	int *d_data_wCP;
-	cufftHandle plan;
-}cuda_cu_ru;
-extern cuda_cu_ru cu_ru;
-#if __cplusplus
-}
-#endif
-#endif
--- a/openair1/CUDA/init_cuda.cu
+++ b/openair1/CUDA/init_cuda.cu
-/*
- * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The OpenAirInterface Software Alliance licenses this file to You under
- * the OAI Public License, Version 1.1  (the "License"); you may not use this file
- * except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.openairinterface.org/?page_id=698
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *-------------------------------------------------------------------------------
- * For more information about the OpenAirInterface (OAI) Software Alliance:
- *      contact@openairinterface.org
- */
- /*! \file init_cuda.cu
- * \brief Create and Implementation of beamforming and ifft in gpu
- * \author TY Hsu, CW Chang
- * \date 2018
- * \version 0.1
- * \company ISIP@NCTU and Eurecom
- * \email: tyhsu@cs.nctu.edu.tw, zhang0756107.cs07g@nctu.edu.tw
- * \note
- * \warning
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cufft.h>
-#include "CUDA/checkError.h"
-#include "CUDA/struct.h"
-#include "CUDA/cuda_struct.h"
-cuda_cu_ru cu_ru;
-extern "C" void init_cuda(int nb_tx, int nb_symbols, int fftsize){
-	printf("init_cuda %d %d %d  \n\n\n", nb_tx, nb_symbols, fftsize);
-	int nb_antenna_ports = 8;
-	//beamforming precoding
-	cu_ru.d_txdataF = (int**)malloc(sizeof(int*) * nb_antenna_ports);
-	for(int p=0; p<nb_antenna_ports; p++){
-		gpuErrchk( cudaMalloc((void**)&cu_ru.d_txdataF[p], fftsize*sizeof(int)*nb_symbols) );
-	}
-	cu_ru.d_beam_stream = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nb_tx);
-	for(int aa=0; aa<nb_tx; aa++){
-		gpuErrchk( cudaStreamCreate(&cu_ru.d_beam_stream[aa]) );	
-	}
-	cu_ru.d_weight = (int***)malloc(sizeof(int**) * nb_antenna_ports);
-	for(int p=0; p<nb_antenna_ports; p++){
-		cu_ru.d_weight[p] = (int**)malloc(sizeof(int*) * nb_tx);
-		for(int aa=0; aa<nb_tx; aa++){
-			gpuErrchk( cudaMalloc((void**)&cu_ru.d_weight[p][aa], fftsize*sizeof(int)) );
-		}
-	}
-	//ifft	
-	gpuErrchk( cudaMalloc((void**)&cu_ru.d_txdataF_BF, fftsize*sizeof(int)*nb_symbols*nb_tx) );
-	gpuErrchk( cudaMalloc((void**)&cu_ru.d_signal, fftsize*sizeof(Complex)*nb_symbols*nb_tx) );
-	gpuErrchk( cudaMalloc((void**)&cu_ru.d_data_wCP, fftsize*(nb_symbols+1)*nb_tx*sizeof(int)) );
-	cufftErrchk( cufftPlan1d(&cu_ru.plan, fftsize, CUFFT_C2C, nb_symbols*nb_tx) );
-}
--- a/openair1/CUDA/init_cuda_def.h
+++ b/openair1/CUDA/init_cuda_def.h
 #ifndef INIT_CUDA_DEF
 #define INIT_CUDA_DEF
-#include "cuda_struct.h"
+#include "struct.h"
-#if __cplusplus
-extern "C" {
-#endif
+typedef cuda_ifft_t{
+	Complex *d_signal;
+	Complex *d_output;
+	int *d_data;	
+}cuda_ifft
-void init_cuda(int nb_tx, int nb_symbols, int fftsize);
-#if __cplusplus
-}
-#endif
 #endif
--- a/openair1/CUDA/struct.h
+++ b/openair1/CUDA/struct.h
-#ifndef OAI_STRUCT
+#ifndef CUDA_STRUCT
-#define OAI_STRUCT
+#define CUDA_STRUCT
+#include <cuda.h>
+#include <cuda_runtime.h>
+typedef float2 Complex;
 typedef enum {
 	  CYCLIC_PREFIX,

--- a/openair1/PHY/INIT/nr_init.c
+++ b/openair1/PHY/INIT/nr_init.c
@@ -33,7 +33,6 @@
 #include "LAYER2/MAC/mac_extern.h"
 #include "assertions.h"
 #include <math.h>
-#include "openair1/CUDA/init_cuda_def.h"
 #include "PHY/NR_TRANSPORT/nr_ulsch.h"
 #include "PHY/NR_REFSIG/nr_refsig.h"

--- a/openair1/PHY/INIT/nr_init_ru.c
+++ b/openair1/PHY/INIT/nr_init_ru.c
@@ -31,7 +31,6 @@
 #include "assertions.h"
 #include <math.h>
 #include "openair1/PHY/defs_RU.h"
-#include "openair1/CUDA/init_cuda_def.h"
 int nr_phy_init_RU(RU_t *ru) {
@@ -42,7 +41,6 @@ int nr_phy_init_RU(RU_t *ru) {
  LOG_I(PHY,"Initializing RU signal buffers (if_south %s) nb_tx %d\n",ru_if_types[ru->if_south],ru->nb_tx);
-  init_cuda(ru->nb_tx, fp->symbols_per_slot, fp->ofdm_symbol_size);
  if (ru->if_south <= REMOTE_IF5) { // this means REMOTE_IF5 or LOCAL_RF, so allocate memory for time-domain signals 
    // Time-domain signals

--- a/openair1/PHY/MODULATION/beamforming.c
+++ b/openair1/PHY/MODULATION/beamforming.c
@@ -52,6 +52,7 @@
 #include "modulation_eNB.h"
 #include "nr_modulation.h"
 #include "common/utils/LOG/vcd_signal_dumper.h"
+#include "CUDA/CUDA_phy_procedure_def.h"
 int beam_precoding(int32_t **txdataF,
@@ -165,9 +166,16 @@ int nr_beam_precoding(int32_t **txdataF,
 	}
  }	
+  void (*multadd_cpx_vector_ptr)(int*, int*, int*, short, unsigned int, int);
+#ifdef CUDA
+multadd_cpx_vector_ptr = CUDA_multadd_cpx_vector;
+#else
+multadd_cpx_vector_ptr = multadd_cpx_vector;
+#endif
  for (p=0; p<nb_antenna_ports; p++) {
    if ((frame_parms->L_ssb >> p) & 0x01)  {
-      multadd_cpx_vector((int16_t*)&txdataF[p][symbol*frame_parms->ofdm_symbol_size],
+      multadd_cpx_vector_ptr((int16_t*)&txdataF[p][symbol*frame_parms->ofdm_symbol_size],
 			 (int16_t*)beam_weights[p][aa], 
 			 (int16_t*)&txdataF_BF[aa][symbol*frame_parms->ofdm_symbol_size], 
 			 0, 

--- a/openair1/SCHED_NR/nr_ru_procedures.c
+++ b/openair1/SCHED_NR/nr_ru_procedures.c
@@ -77,8 +77,15 @@ void nr_feptx0(RU_t *ru,int tti_tx,int first_symbol, int num_symbols, int aa) {
  LOG_D(PHY,"SFN/SF:RU:TX:%d/%d Generating slot %d (first_symbol %d num_symbols %d)\n",ru->proc.frame_tx, ru->proc.tti_tx,slot,first_symbol,num_symbols);
+void (*PHY_ofdm_mod_ptr)(int*, int*, int, unsigned char, unsigned short, Extension_t);
+#ifdef CUDA
+PHY_ofdm_mod_ptr = CUDA_PHY_ofdm_mod;
+#else
+PHY_ofdm_mod_ptr = PHY_ofdm_mod;
+#endif
  if (fp->Ncp == 1) {
-    PHY_ofdm_mod(&ru->common.txdataF_BF[aa][slot_offsetF],
+    PHY_ofdm_mod_ptr(&ru->common.txdataF_BF[aa][slot_offsetF],
                 (int*)&ru->common.txdata[aa][slot_offset],
                 fp->ofdm_symbol_size,
                 num_symbols,
@@ -87,13 +94,13 @@ void nr_feptx0(RU_t *ru,int tti_tx,int first_symbol, int num_symbols, int aa) {
  }
  else {
    if (first_symbol==0) {
-      PHY_ofdm_mod(&ru->common.txdataF_BF[aa][slot_offsetF],
+      PHY_ofdm_mod_ptr(&ru->common.txdataF_BF[aa][slot_offsetF],
                   (int*)&ru->common.txdata[aa][slot_offset],
                   fp->ofdm_symbol_size,
                   1,
                   fp->nb_prefix_samples0,
                   CYCLIC_PREFIX);
-      PHY_ofdm_mod(&ru->common.txdataF_BF[aa][slot_offsetF+fp->ofdm_symbol_size],
+      PHY_ofdm_mod_ptr(&ru->common.txdataF_BF[aa][slot_offsetF+fp->ofdm_symbol_size],
                   (int*)&ru->common.txdata[aa][slot_offset+fp->nb_prefix_samples0+fp->ofdm_symbol_size],
                   fp->ofdm_symbol_size,
                   num_symbols-1,
@@ -101,7 +108,7 @@ void nr_feptx0(RU_t *ru,int tti_tx,int first_symbol, int num_symbols, int aa) {
                   CYCLIC_PREFIX);
    }
    else {
-      PHY_ofdm_mod(&ru->common.txdataF_BF[aa][slot_offsetF],
+      PHY_ofdm_mod_ptr(&ru->common.txdataF_BF[aa][slot_offsetF],
                   (int*)&ru->common.txdata[aa][slot_offset],
                   fp->ofdm_symbol_size,
                   num_symbols,
@@ -113,40 +120,8 @@ void nr_feptx0(RU_t *ru,int tti_tx,int first_symbol, int num_symbols, int aa) {
  //VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_RU_FEPTX_OFDM+(first_symbol!=0?1:0), 0);
 }
-void CUDA_prec_ofdm(RU_t *ru,int frame_tx,int tti_tx){
-  	nfapi_nr_config_request_t *cfg = &ru->gNB_list[0]->gNB_config;
-	if(nr_slot_select(cfg, tti_tx) == SF_UL) return;
-	int slot = tti_tx;
-	NR_DL_FRAME_PARMS *fp = ru->nr_frame_parms;
-	PHY_VARS_gNB *gNB = ru->gNB_list[0];
-	int nb_antenna_ports = 8;
-	//data L1 to ru
-    for(int p=0; p<nb_antenna_ports; ++p){
-		memcpy((void*)ru->common.txdataF[p], (void*)&gNB->common_vars.txdataF[p],
-				fp->ofdm_symbol_size*sizeof(int32_t)*fp->symbols_per_slot);
-		//fake data
-		for(int j=0; j<fp->ofdm_symbol_size*fp->symbols_per_slot; j++){
-			((short*)&ru->common.txdataF[p][j])[0] = 1;
-			((short*)&ru->common.txdataF[p][j])[1] = 1;
-		}
-    }
-	CUDA_beam_precoding((int**)ru->common.txdataF, (int***)ru->beam_weights[0], fp->L_ssb, 3,
-			fp->ofdm_symbol_size, fp->symbols_per_slot, nb_antenna_ports, ru->nb_tx);
-	CUDA_ifft_ofdm((int**)ru->common.txdata, 
-			 fp->ofdm_symbol_size, fp->symbols_per_slot, 
-			 fp->nb_prefix_samples, fp->nb_prefix_samples0, ru->nb_tx,
-			 fp->Ncp, CYCLIC_PREFIX);
-}
 void nr_feptx_ofdm_2thread(RU_t *ru,int frame_tx,int tti_tx) {
-printf("nr_feptx_ofdm_2thread : frame_tx:%d tti_tx:%d\n", frame_tx, tti_tx);return; 
  nfapi_nr_config_request_t *cfg = &ru->gNB_list[0]->gNB_config;
  RU_proc_t  *proc  = &ru->proc;
@@ -245,7 +220,6 @@ printf("nr_feptx_ofdm_2thread : frame_tx:%d tti_tx:%d\n", frame_tx, tti_tx);retu
 }
 static void *nr_feptx_thread(void *param) {
  RU_feptx_t *feptx = (RU_feptx_t *)param;
  RU_t       *ru;
  int         aa, slot, start, l, nb_antenna_ports, ret;