slot fft ok

d16884d5 · Fang-WANG · 5b5b48d3 · d16884d5 · d16884d5 · d16884d5
Commit d16884d5 authored Jun 17, 2021 by Fang-WANG
25 changed files
--- a/cmake_targets/build_oai
+++ b/cmake_targets/build_oai
@@ -764,7 +764,8 @@ function main() {
  if [ "$SIMUS_PHY" = "1" ] ; then
    echo_info "Compiling physical unitary tests simulators"
    # TODO: fix: dlsim_tm4 pucchsim prachsim pdcchsim pbchsim mbmssim
-    simlist="dlsim ulsim ldpctest polartest smallblocktest nr_pbchsim nr_dlschsim nr_ulschsim nr_dlsim nr_ulsim nr_pucchsim nr_prachsim"
+    simlist="nr_ulsim"
+    # simlist="dlsim ulsim ldpctest polartest smallblocktest nr_pbchsim nr_dlschsim nr_ulschsim nr_dlsim nr_ulsim nr_pucchsim nr_prachsim"
 	# simlist="ldpctest"
    for f in $simlist ; do
      compilations \

--- a/common/utils/LOG/vcd_signal_dumper.c
+++ b/common/utils/LOG/vcd_signal_dumper.c
@@ -535,6 +535,7 @@ const char* eurecomFunctionsNames[] = {
  "nr_rx_pusch",
  "nr_ulsch_procedures_rx",
  "macxface_gNB_dlsch_ulsch_scheduler",
+  "cufft_wait",

  /*NR ue-softmodem signal*/
  "nr_ue_ulsch_encoding",

--- a/common/utils/LOG/vcd_signal_dumper.h
+++ b/common/utils/LOG/vcd_signal_dumper.h
@@ -527,6 +527,7 @@ typedef enum {
  VCD_SIGNAL_DUMPER_FUNCTIONS_NR_RX_PUSCH,
  VCD_SIGNAL_DUMPER_FUNCTIONS_NR_ULSCH_PROCEDURES_RX,
  VCD_SIGNAL_DUMPER_FUNCTIONS_gNB_DLSCH_ULSCH_SCHEDULER,
+  VCD_SIGNAL_DUMPER_FUNCTIONS_CUFFT_WAIT,

 /* NR ue-softmodem signal*/
  VCD_SIGNAL_DUMPER_FUNCTIONS_NR_UE_ULSCH_ENCODING,

--- a/common/utils/T/T_defs.h
+++ b/common/utils/T/T_defs.h
@@ -73,7 +73,7 @@ typedef struct {
 } T_cache_t;

 /* number of VCD functions (to be kept up to date! see in T_messages.txt) */
-#define VCD_NUM_FUNCTIONS (273)
+#define VCD_NUM_FUNCTIONS (274)

 /* number of VCD variables (to be kept up to date! see in T_messages.txt) */
 #define VCD_NUM_VARIABLES (187)

--- a/common/utils/T/T_messages.txt
+++ b/common/utils/T/T_messages.txt
@@ -3513,6 +3513,11 @@ ID = VCD_FUNCTION_gNB_DLSCH_ULSCH_SCHEDULER
    GROUP = ALL:VCD:ENB:VCD_FUNCTION
    FORMAT = int,value
    VCD_NAME = macxface_gNB_dlsch_ulsch_scheduler
+ID = VCD_FUNCTION_CUFFT_WAIT
+    DESC = VCD function CUFFT_WAIT
+    GROUP = ALL:VCD:ENB:VCD_FUNCTION
+    FORMAT = int,value
+    VCD_NAME = cufft_wait

  #function for nrUE
 ID = VCD_FUNCTION_NR_UE_ULSCH_ENCODING

--- a/executables/nr-softmodem.c
+++ b/executables/nr-softmodem.c
@@ -87,6 +87,7 @@ unsigned short config_frames[4] = {2,9,11,13};
 #include "gnb_paramdef.h"
 #include <openair3/ocp-gtpu/gtp_itf.h>
 #include "nfapi/oai_integration/vendor_ext.h"
+#include "PHY/CODING/nrLDPC_extern.h"

 pthread_cond_t nfapi_sync_cond;
 pthread_mutex_t nfapi_sync_mutex;
@@ -705,6 +706,7 @@ int main( int argc, char **argv )


  init_opt();
+  load_cuFFT1();


 #ifdef PDCP_USE_NETLINK

--- a/hs/CMakeLists.txt
+++ b/hs/CMakeLists.txt
+cmake_minimum_required(VERSION 2.8)
+project(run)
+FIND_PACKAGE(CUDA REQUIRED)
+# Pass options to NVCC
+# 由于cuda采用NVCC编译而不是gCC编译，因此需要将参数传递给NVCC
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -lcufft)
+# For compilation ...
+# Specify target & source files to compile it from
+CUDA_ADD_EXECUTABLE(run cuFFT2.cu)
+# For linking ...
+# Specify target & libraries to link it with
+CUDA_ADD_CUFFT_TO_TARGET(run)
+# 添加对gdb的支持
+# SET(CMAKE_BUILD_TYPE "Debug")
+# SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -Wall -g2 -ggdb")
+# SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall")
--- a/hs/cuFFT.cu
+++ b/hs/cuFFT.cu
+#include <stdio.h>
+#include <cufft.h>
+#include <cuda_runtime.h>
+
+#define LEN 2048
+#define SQRT2048_real 45.2876
+#define SQRT2048_imag 45.3065
+#define SYMBOLS_PER_SLOT 1
+
+__global__ void int_cufftComplex(int16_t *a, cufftComplex *b, int length)
+{
+    int id = (blockIdx.x * blockDim.x + threadIdx.x);
+    if(id >=length)
+    {
+        return;
+    }
+    b[id].x = a[id*2];
+    b[id].y = a[id*2+1];
+}
+
+__global__ void cufftComplex_int(cufftComplex *a, int16_t *b, int length)
+{
+    int id = (blockIdx.x * blockDim.x + threadIdx.x);
+    if(id >=length)
+    {
+        return;
+    }
+    b[id*2] = a[id].x/SQRT2048_real;
+    b[id*2+1] = a[id].y/SQRT2048_imag;
+}
+
+
+int16_t *x1;
+cufftComplex *CompData;
+cufftHandle plan;
+void initcudft()
+{
+    cudaMalloc((void**)&x1, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    cudaMalloc((void**)&CompData, SYMBOLS_PER_SLOT*LEN * sizeof(cufftComplex));
+    cufftPlan1d(&plan, LEN, CUFFT_C2C, 1);
+	// int rank=1;
+	// int n[1]; n[0]=LEN;
+    // int nembed[2]; nembed[0]=LEN; nembed[1]=SYMBOLS_PER_SLOT;
+    // int stride=1;
+    // int dist = LEN;
+    // int batch=SYMBOLS_PER_SLOT;
+    // cufftPlanMany(&plan,rank,n,nembed, stride ,dist , nembed, stride,dist, CUFFT_C2C, batch);
+}
+
+void cudft2048(int16_t *x,int16_t *y,unsigned char scale)
+{
+
+    cudaMemcpy(x1, x, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaMemcpyHostToDevice);
+
+    int threadNum = 512;
+    int blockNum = (SYMBOLS_PER_SLOT * LEN - 0.5) / threadNum + 1;
+    int_cufftComplex<<<blockNum, threadNum>>>(x1, CompData, SYMBOLS_PER_SLOT*LEN);
+
+    cufftExecC2C(plan, (cufftComplex*)CompData, (cufftComplex*)CompData, CUFFT_FORWARD);//execute
+    cudaDeviceSynchronize();//wait to be done
+
+    cufftComplex_int<<<blockNum, threadNum>>>(CompData, x1, SYMBOLS_PER_SLOT*LEN);
+    cudaMemcpy(y, x1, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaMemcpyDeviceToHost);// copy the result from device to host
+    static int hshs=0;
+    printf("------------%d\n",hshs);
+    hshs++;
+}
+
+void load_cuFFT(void) 
+{
+    initcudft();
+    int16_t *a = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    int16_t *b = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    cudft2048(a,b,1);
+}
+
+
+
+int main()
+{
+    load_cuFFT();
+    int16_t *a = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    int16_t *b = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    for (int i = 0; i < SYMBOLS_PER_SLOT*LEN; i++)
+    {
+        *(a+2*i) = i;
+        *(a+2*i+1) = LEN-i;
+    }
+    for (int i = 0; i < 3; i++)
+    {
+        cudft2048((int16_t *)a,(int16_t *)b,0);
+        printf("hs1111111111111111:\n");
+        for (int j = 0; j < SYMBOLS_PER_SLOT*LEN; j++)
+        {
+            printf("a=%d + %dj\tb=%d + %dj\n", a[j*2],a[j*2+1],b[j*2],b[j*2+1]);
+        }
+    }
+}
\ No newline at end of file
--- a/hs/cuFFT.o
+++ b/hs/cuFFT.o
--- a/hs/cuFFT.so
+++ b/hs/cuFFT.so
--- a/hs/cuFFT0.cu
+++ b/hs/cuFFT0.cu
+#include <stdio.h>
+#include <cufft.h>
+#include <cuda_runtime.h>
+
+#define LEN 2048
+#define SQRT2048_real 45.2876
+#define SQRT2048_imag 45.3065
+#define SYMBOLS_PER_SLOT 1400
+
+__global__ void int_cufftComplex(int16_t *a, cufftComplex *b, int length)
+{
+    int id = (blockIdx.x * blockDim.x + threadIdx.x);
+    if(id >=length)
+    {
+        return;
+    }
+    b[id].x = a[id*2];
+    b[id].y = a[id*2+1];
+}
+
+__global__ void cufftComplex_int(cufftComplex *a, int16_t *b, int length)
+{
+    int id = (blockIdx.x * blockDim.x + threadIdx.x);
+    if(id >=length)
+    {
+        return;
+    }
+    b[id*2] = a[id].x/SQRT2048_real;
+    b[id*2+1] = a[id].y/SQRT2048_imag;
+}
+
+int16_t *x11;
+cufftComplex *CompData1;
+cufftHandle plan1;
+void initcudft()
+{
+    cudaMalloc((void**)&x11, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    cudaMalloc((void**)&CompData1, SYMBOLS_PER_SLOT*LEN * sizeof(cufftComplex));
+    // cufftPlan1d(&plan1, LEN, CUFFT_C2C, SYMBOLS_PER_SLOT);
+	int rank=1;
+	int n[1]; n[0]=LEN;
+    int nembed[2]; nembed[0]=LEN; nembed[1]=SYMBOLS_PER_SLOT;
+    int stride=1;
+    int dist = LEN;
+    int batch=SYMBOLS_PER_SLOT;
+    cufftPlanMany(&plan1,rank,n,nembed, stride ,dist , nembed, stride,dist, CUFFT_C2C, batch);
+}
+
+void cudft2048(int16_t *x,int16_t *y,unsigned char scale)
+{
+    // cudaStream_t stream;
+	// cudaStreamCreate(&stream);
+
+    cudaEvent_t start, stop;
+    float time;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    
+    
+    cudaMemcpy(x11, x, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaMemcpyHostToDevice);
+    
+    
+    int threadNum = 512;
+    int blockNum = (SYMBOLS_PER_SLOT * LEN - 1) / threadNum + 1;
+    cudaEventRecord( start, 0 );
+    int_cufftComplex<<<blockNum, threadNum>>>(x11, CompData1, SYMBOLS_PER_SLOT*LEN);
+    cudaEventRecord( stop, 0 );
+    cudaEventSynchronize(start);
+    cudaEventSynchronize( stop );//注意函数所处位置
+
+    cufftExecC2C(plan1, (cufftComplex*)CompData1, (cufftComplex*)CompData1, CUFFT_FORWARD);//execute
+    cudaDeviceSynchronize();//wait to be done
+
+    cufftComplex_int<<<blockNum, threadNum>>>(CompData1, x11, SYMBOLS_PER_SLOT*LEN);
+    cudaMemcpy(y, x11, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaMemcpyDeviceToHost);// copy the result from device to host
+    
+    cudaEventElapsedTime( &time, start, stop );
+    printf("cudft2048执行时间：%f(us)\n",time*1000);
+
+}
+
+void load_cuFFT(void) 
+{
+    initcudft();
+    int16_t *a = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    int16_t *b = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    cudft2048(a,b,1);
+}
+
+
+
+int main()
+{
+    load_cuFFT();
+    int16_t *a = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    int16_t *b = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    // int16_t *a;
+    // int16_t *b;
+    // cudaHostAlloc((void **)&a, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaHostAllocDefault);
+    // cudaHostAlloc((void **)&b, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaHostAllocDefault);
+    for (int i = 0; i < SYMBOLS_PER_SLOT*LEN; i++)
+    {
+        *(a+2*i) = i;
+        *(a+2*i+1) = LEN-i;
+    }
+    for (int i = 0; i < 10; i++)
+    {
+        cudft2048((int16_t *)a,(int16_t *)b,0);
+        // printf("hs1111111111111111:\n");
+        // for (int j = 0; j < SYMBOLS_PER_SLOT*LEN; j++)
+        // {
+        //     printf("a=%d + %dj\tb=%d + %dj\n", a[j*2],a[j*2+1],b[j*2],b[j*2+1]);
+        // }
+    }
+}
--- a/hs/cuFFT1.cu
+++ b/hs/cuFFT1.cu
+#include <stdio.h>
+#include <cufft.h>
+#include <cuda_runtime.h>
+
+#define LEN 2048
+#define SQRT2048_real 45.2876
+#define SQRT2048_imag 45.3065
+#define SYMBOLS_PER_SLOT 14
+
+__global__ void int_cufftComplex(int16_t *a, cufftComplex *b, int length)
+{
+    int id = (blockIdx.x * blockDim.x + threadIdx.x);
+    if(id >=length)
+    {
+        return;
+    }
+    b[id].x = a[id*2];
+    b[id].y = a[id*2+1];
+}
+
+__global__ void cufftComplex_int(cufftComplex *a, int16_t *b, int length)
+{
+    int id = (blockIdx.x * blockDim.x + threadIdx.x);
+    if(id >=length)
+    {
+        return;
+    }
+    b[id*2] = a[id].x/SQRT2048_real;
+    b[id*2+1] = a[id].y/SQRT2048_imag;
+}
+
+int16_t *cuda_x;
+int16_t *cuda_y;
+int16_t *x11;
+cufftComplex *CompData1;
+cufftHandle plan1;
+void initcudft()
+{
+    cudaMalloc((void**)&x11, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    cudaMalloc((void**)&CompData1, SYMBOLS_PER_SLOT*LEN * sizeof(cufftComplex));
+    // cufftPlan1d(&plan1, LEN, CUFFT_C2C, SYMBOLS_PER_SLOT);
+	int rank=1;
+	int n[1]; n[0]=LEN;
+    int nembed[2]; nembed[0]=LEN; nembed[1]=SYMBOLS_PER_SLOT;
+    int stride=1;
+    int dist = LEN;
+    int batch=SYMBOLS_PER_SLOT;
+    cufftPlanMany(&plan1,rank,n,nembed, stride ,dist , nembed, stride,dist, CUFFT_C2C, batch);
+    cudaHostAlloc((void **)&cuda_x, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaHostAllocDefault);
+    cudaHostAlloc((void **)&cuda_y, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaHostAllocDefault);
+}
+
+void cudft2048(int16_t *x,int16_t *y,unsigned char scale)
+{
+    // cudaStream_t stream;
+	// cudaStreamCreate(&stream);
+
+    cudaEvent_t start, stop;
+    float time;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord( start, 0 );
+
+    memcpy(cuda_x,x,SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    cudaMemcpy(x11, cuda_x, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaMemcpyHostToDevice);
+
+    int threadNum = 512;
+    int blockNum = (SYMBOLS_PER_SLOT * LEN - 1) / threadNum + 1;
+    
+    int_cufftComplex<<<blockNum, threadNum>>>(x11, CompData1, SYMBOLS_PER_SLOT*LEN);
+
+    cufftExecC2C(plan1, (cufftComplex*)CompData1, (cufftComplex*)CompData1, CUFFT_FORWARD);//execute
+    cudaDeviceSynchronize();//wait to be done
+
+    cufftComplex_int<<<blockNum, threadNum>>>(CompData1, x11, SYMBOLS_PER_SLOT*LEN);
+    cudaMemcpy(cuda_y, x11, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaMemcpyDeviceToHost);// copy the result from device to host
+    memcpy(y,cuda_y,SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+
+    cudaEventRecord( stop, 0 );
+    cudaEventSynchronize(start);
+    cudaEventSynchronize( stop );//注意函数所处位置
+    cudaEventElapsedTime( &time, start, stop );
+    printf("cudft2048执行时间：%f(us)\n",time*1000);
+
+}
+
+void load_cuFFT(void) 
+{
+    initcudft();
+    int16_t *a = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    int16_t *b = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    cudft2048(a,b,1);
+}
+
+
+
+int main()
+{
+    load_cuFFT();
+    int16_t *a = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    int16_t *b = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    // int16_t *a;
+    // int16_t *b;
+    // cudaHostAlloc((void **)&a, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaHostAllocDefault);
+    // cudaHostAlloc((void **)&b, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaHostAllocDefault);
+    for (int i = 0; i < SYMBOLS_PER_SLOT*LEN; i++)
+    {
+        *(a+2*i) = rand()%LEN;
+        *(a+2*i+1) = rand()%LEN;
+    }
+    for (int i = 0; i < 10; i++)
+    {
+        cudft2048((int16_t *)a,(int16_t *)b,0);
+        // printf("hs1111111111111111:\n");
+        // for (int j = 0; j < SYMBOLS_PER_SLOT*LEN; j++)
+        // {
+        //     printf("a=%d + %dj\tb=%d + %dj\n", a[j*2],a[j*2+1],b[j*2],b[j*2+1]);
+        // }
+    }
+}
--- a/hs/cuFFT1.o
+++ b/hs/cuFFT1.o
--- a/hs/cuFFT1.so
+++ b/hs/cuFFT1.so
--- a/hs/cuFFT2.cu
+++ b/hs/cuFFT2.cu
+#include <stdio.h>
+#include <cufft.h>
+#include <cuda_runtime.h>
+
+#define LEN 2048
+#define SQRT2048_real 45.2876
+#define SQRT2048_imag 45.3065
+#define SYMBOLS_PER_SLOT 64
+
+__global__ void int_cufftComplex(int16_t *a, cufftComplex *b, int length)
+{
+    int id = (blockIdx.x * blockDim.x + threadIdx.x);
+    if(id >=length)
+    {
+        return;
+    }
+    b[id].x = a[id*2];
+    b[id].y = a[id*2+1];
+}
+
+__global__ void cufftComplex_int(cufftComplex *a, int16_t *b, int length)
+{
+    int id = (blockIdx.x * blockDim.x + threadIdx.x);
+    if(id >=length)
+    {
+        return;
+    }
+    b[id*2] = a[id].x/SQRT2048_real;
+    b[id*2+1] = a[id].y/SQRT2048_imag;
+}
+
+// cufftComplex *fftData;
+cufftComplex *d_fftData;
+cufftHandle plan1;
+// cufftComplex *CompData;
+
+int16_t *temp;
+int16_t *cuda_temp1;
+// int16_t *cuda_temp;
+void initcudft()
+{
+    // CompData = (cufftComplex*)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(cufftComplex));
+	int rank=1;
+	int n[1]; n[0]=LEN;
+    int nembed[2]; nembed[0]=LEN; nembed[1]=SYMBOLS_PER_SLOT;
+    int stride=1;
+    int dist = LEN;
+    int batch=SYMBOLS_PER_SLOT;
+    cufftPlanMany(&plan1,rank,n,nembed, stride ,dist , nembed, stride,dist, CUFFT_C2C, batch);
+    // cufftPlan1d(&plan1, LEN, CUFFT_C2C, SYMBOLS_PER_SLOT);
+    // cudaMallocHost((void **)&fftData, SYMBOLS_PER_SLOT*LEN * sizeof(cufftComplex), cudaHostAllocMapped);
+    // cudaHostGetDevicePointer ((void**)&d_fftData, (void*)fftData, 0 );
+    cudaMalloc((void**)&d_fftData, SYMBOLS_PER_SLOT*LEN  * sizeof(cufftComplex));
+    cudaHostAlloc((void **)&temp, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaHostAllocPortable);
+    cudaMalloc((void **)&cuda_temp1, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    // cudaHostGetDevicePointer ((void**)&cuda_temp, (void*)temp, 0 );
+    
+}
+
+void cudft2048(int16_t *x,int16_t *y,unsigned char scale)
+{
+    // cudaEvent_t start, stop;
+    // float time;
+    // cudaEventCreate(&start);
+    // cudaEventCreate(&stop);
+    // cudaEventRecord( start, 0 );
+    
+    // for (int i = 0; i < SYMBOLS_PER_SLOT*LEN; i++)
+    // {
+    //     fftData[i].x = x[i*2];
+    //     fftData[i].y = x[i*2+1];
+    // }
+    memcpy(temp,x,SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    int threadNum = 512;
+    int blockNum = (SYMBOLS_PER_SLOT * LEN - 1) / threadNum + 1;
+    
+
+    
+    int_cufftComplex<<<blockNum, threadNum>>>(temp, d_fftData, SYMBOLS_PER_SLOT*LEN);
+    
+   
+
+    // memcpy(fftData,CompData,SYMBOLS_PER_SLOT*LEN * sizeof(cufftComplex));
+    // cudaMemcpy(d_fftData, CompData, SYMBOLS_PER_SLOT*LEN * sizeof(cufftComplex), cudaMemcpyHostToDevice);
+    
+     
+    cufftExecC2C(plan1, (cufftComplex*)d_fftData, (cufftComplex*)d_fftData, CUFFT_FORWARD);//execute
+    // cudaDeviceSynchronize();//wait to be done
+     
+    
+    // memcpy(CompData,fftData,SYMBOLS_PER_SLOT*LEN * sizeof(cufftComplex));
+    // cudaMemcpy(CompData, d_fftData, SYMBOLS_PER_SLOT*LEN * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
+    // for (int i = 0; i < SYMBOLS_PER_SLOT*LEN; i++)
+    // {
+    //     y[i*2] = fftData[i].x/SQRT2048_real;
+    //     y[i*2+1] = fftData[i].y/SQRT2048_imag;
+    // }
+    
+    cufftComplex_int<<<blockNum, threadNum>>>(d_fftData, cuda_temp1, SYMBOLS_PER_SLOT*LEN);
+    
+    cudaMemcpy(temp, cuda_temp1, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaMemcpyDeviceToHost);
+    memcpy(y,temp,SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    // cudaEventRecord( stop, 0 );
+    // cudaEventSynchronize(start);
+    // cudaEventSynchronize( stop );//注意函数所处位置
+    // cudaEventElapsedTime( &time, start, stop );
+    // printf("cudft2048执行时间：%f(us)\n",time*1000);
+    // printf("----------------------------------\n");
+}
+
+
+void load_cuFFT(void) 
+{
+    initcudft();
+    int16_t *a = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    int16_t *b = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    cudft2048(a,b,1);
+}
+
+
+
+int main()
+{
+    load_cuFFT();
+    int16_t *a = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    int16_t *b = (int16_t *)malloc(SYMBOLS_PER_SLOT*LEN * sizeof(int32_t));
+    // int16_t *a;
+    // int16_t *b;
+    // cudaHostAlloc((void **)&a, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaHostAllocDefault);
+    // cudaHostAlloc((void **)&b, SYMBOLS_PER_SLOT*LEN * sizeof(int32_t), cudaHostAllocDefault);
+
+    for (int j = 0; j < 100; j++)
+    {
+        for (int i = 0; i < SYMBOLS_PER_SLOT*LEN; i++)
+        {
+            *(a+2*i) = rand()%LEN;
+            *(a+2*i+1) = rand()%LEN;
+        }
+        cudaEvent_t start, stop;
+        float time;
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+        cudaEventRecord( start, 0 );
+        cudft2048((int16_t *)a,(int16_t *)b,0);
+        cudaEventRecord( stop, 0 );
+        cudaEventSynchronize(start);
+        cudaEventSynchronize( stop );//注意函数所处位置
+        cudaEventElapsedTime( &time, start, stop );
+        printf("cudft2048执行时间：%f(us)\n",time*1000);
+        // printf("hs1111111111111111:\n");
+        // for (int j = 0; j < SYMBOLS_PER_SLOT*LEN; j++)
+        // {
+        //     printf("a=%d + %dj\tb=%d + %dj\n", a[j*2],a[j*2+1],b[j*2],b[j*2+1]);
+        // }
+    }
+}
+
--- a/hs/cuFFT2.o
+++ b/hs/cuFFT2.o
--- a/openair1/PHY/CODING/nrLDPC_defs.h
+++ b/openair1/PHY/CODING/nrLDPC_defs.h
@@ -56,4 +56,6 @@ typedef int(*nrLDPC_encoderfunc_t)(unsigned char **,unsigned char **,int,int,sho
   \param p_profiler LDPC profiler statistics
 */
 typedef int32_t(*nrLDPC_decoderfunc_t)(t_nrLDPC_dec_params* , int8_t*, int8_t* , t_nrLDPC_procBuf* , t_nrLDPC_time_stats* );
+typedef void(*cudft_EnTx)(int16_t*, int16_t*, unsigned char);
+typedef void(*cudft_load)(void);
 #endif
\ No newline at end of file
--- a/openair1/PHY/CODING/nrLDPC_extern.h
+++ b/openair1/PHY/CODING/nrLDPC_extern.h
@@ -23,13 +23,23 @@
 #ifdef LDPC_LOADER
 nrLDPC_decoderfunc_t nrLDPC_decoder;
 nrLDPC_encoderfunc_t nrLDPC_encoder;
+cudft_EnTx cudft2048;
+cudft_load load_cudft;
+cudft_EnTx cudft20481;
+cudft_load load_cudft1;
 #else
 /* functions to load the LDPC shared lib, implemented in openair1/PHY/CODING/nrLDPC_load.c */
 extern int load_nrLDPClib(void) ;
 extern int load_nrLDPClib_ref(char *libversion, nrLDPC_encoderfunc_t * nrLDPC_encoder_ptr); // for ldpctest
+extern int load_cuFFT(void) ;
+extern int load_cuFFT1(void) ;
 /* ldpc coder/decoder functions, as loaded by load_nrLDPClib(). */
 extern nrLDPC_decoderfunc_t nrLDPC_decoder;
 extern nrLDPC_encoderfunc_t nrLDPC_encoder;
+extern cudft_EnTx cudft2048;
+extern cudft_load load_cudft;
+extern cudft_EnTx cudft20481;
+extern cudft_load load_cudft1;
 // inline functions:
 #include "openair1/PHY/CODING/nrLDPC_decoder/nrLDPC_init_mem.h"
 #endif
\ No newline at end of file
--- a/openair1/PHY/CODING/nrLDPC_load.c
+++ b/openair1/PHY/CODING/nrLDPC_load.c
@@ -39,6 +39,7 @@
 #include "PHY/CODING/nrLDPC_extern.h"
 #include "common/config/config_userapi.h" 
 #include "common/utils/load_module_shlib.h" 
+#include <dlfcn.h>


 /* function description array, to be used when loading the encoding/decoding shared lib */
@@ -73,4 +74,51 @@ int load_nrLDPClib_ref(char *libversion, nrLDPC_encoderfunc_t * nrLDPC_encoder_p
 return 0;
 }

+int load_cuFFT(void) {
+     //手动加载指定位置的so动态库
+     void* handle = dlopen("../../../hs/cuFFT.so", RTLD_LAZY|RTLD_NODELETE|RTLD_GLOBAL);
+     if(!handle){
+          printf("open cuFFT.so error!\n");
+          return -1;
+     }
+     //根据动态链接库操作句柄与符号，返回符号对应的地址
+     cudft2048 = (cudft_EnTx) dlsym(handle, "_Z9cudft2048PsS_h");
+     if(!cudft2048){
+          printf("cuFFT.so cudft2048 error!\n");
+          dlclose(handle);
+          return -1;
+     }
+     load_cudft = (cudft_load) dlsym(handle, "_Z10load_cuFFTv");
+     if(!load_cudft){
+          printf("cuFFT.so load_cudft error!\n");
+          dlclose(handle);
+          return -1;
+     }
+     load_cudft();
+return 0;
+}
+
+int load_cuFFT1(void) {
+     //手动加载指定位置的so动态库
+     void* handle1 = dlopen("../../../hs/cuFFT1.so", RTLD_LAZY|RTLD_NODELETE|RTLD_GLOBAL);
+     if(!handle1){
+          printf("open cuFFT1.so error!\n");
+          return -1;
+     }
+     //根据动态链接库操作句柄与符号，返回符号对应的地址
+     cudft20481 = (cudft_EnTx) dlsym(handle1, "_Z9cudft2048PsS_h");
+     if(!cudft20481){
+          printf("cuFFT1.so cudft2048 error!\n");
+          dlclose(handle1);
+          return -1;
+     }
+     load_cudft1 = (cudft_load) dlsym(handle1, "_Z10load_cuFFTv");
+     if(!load_cudft1){
+          printf("cuFFT1.so load_cudft error!\n");
+          dlclose(handle1);
+          return -1;
+     }
+     load_cudft1();
+return 0;
+}

--- a/openair1/PHY/MODULATION/nr_modulation.h
+++ b/openair1/PHY/MODULATION/nr_modulation.h
@@ -89,6 +89,13 @@ int nr_slot_fep_ul(NR_DL_FRAME_PARMS *frame_parms,
                   unsigned char Ns,
                   int sample_offset);

+int cuda_nr_slot_fep_ul(NR_DL_FRAME_PARMS *frame_parms,
+                   int32_t *rxdata,
+                   int32_t *rxdataF,
+                   int symbol_slot,
+                   unsigned char Ns,
+                   int sample_offset);
+
 /*!
 \brief This function implements the dft transform precoding in PUSCH
 \param z Pointer to output in frequnecy domain

--- a/openair1/PHY/MODULATION/slot_fep_nr.c
+++ b/openair1/PHY/MODULATION/slot_fep_nr.c
@@ -26,6 +26,10 @@
 #include "PHY/LTE_ESTIMATION/lte_estimation.h"
 #include "PHY/NR_UE_ESTIMATION/nr_estimation.h"
 #include <common/utils/LOG/log.h>
+#include "PHY/CODING/nrLDPC_extern.h"
+#include "PHY/CODING/nrLDPC_extern.h"
+#include "common/utils/LOG/vcd_signal_dumper.h"
+#include "common/utils/LOG/log.h"

 //#define DEBUG_FEP

@@ -332,10 +336,80 @@ int nr_slot_fep_ul(NR_DL_FRAME_PARMS *frame_parms,
      rxdata_ptr,
      (int16_t *)&rxdataF[symbol * frame_parms->ofdm_symbol_size],
      1);
+  // cudft2048(rxdata_ptr,(int16_t *)&rxdataF[symbol * frame_parms->ofdm_symbol_size],0);

  // clear DC carrier from OFDM symbols
  rxdataF[symbol * frame_parms->ofdm_symbol_size] = 0;

+  // static int cu_2048 = 0;
+  // if(cu_2048==0)
+  // {
+  //   LOG_M("./fft/FFT0.m","input",rxdata_ptr,frame_parms->ofdm_symbol_size,1,15);
+  //   LOG_M("./fft/FFT1.m","fftoutput",(int16_t *)&rxdataF[symbol * frame_parms->ofdm_symbol_size],2048,1,15);
+  // }
+  // else if(cu_2048<13)
+  // {
+  //   LOG_M("./fft/FFT0.m","input",rxdata_ptr,frame_parms->ofdm_symbol_size,1,13);
+  //   LOG_M("./fft/FFT1.m","fftoutput",(int16_t *)&rxdataF[symbol * frame_parms->ofdm_symbol_size],2048,1,13);
+  // }
+  // else if(cu_2048 == 13)
+  // {
+  //   LOG_M("./fft/FFT0.m","input",rxdata_ptr,frame_parms->ofdm_symbol_size,1,14);
+  //   LOG_M("./fft/FFT1.m","fftoutput",(int16_t *)&rxdataF[symbol * frame_parms->ofdm_symbol_size],2048,1,14);
+  // }
+  // cu_2048++;
+
+  return 0;
+}
+
+int cuda_nr_slot_fep_ul(NR_DL_FRAME_PARMS *frame_parms,
+                   int32_t *rxdata,
+                   int32_t *rxdataF,
+                   int symbol_slot,
+                   unsigned char Ns,
+                   int sample_offset)
+{
+  unsigned int nb_prefix_samples  = frame_parms->nb_prefix_samples;
+  unsigned int nb_prefix_samples0 = frame_parms->nb_prefix_samples0;
+  
+  // dft_size_idx_t dftsize = get_dft_size_idx(frame_parms->ofdm_symbol_size);
+  // This is for misalignment issues
+  int32_t tmp_dft_in[frame_parms->ofdm_symbol_size*symbol_slot] __attribute__ ((aligned (32)));
+
+  unsigned int slot_offset = frame_parms->get_samples_slot_timestamp(Ns,frame_parms,0);
+
+  int16_t *rxdata_ptr;
+  for (int symbol = 0; symbol < symbol_slot; symbol++) {
+    // offset of first OFDM symbol
+    int32_t rxdata_offset = slot_offset + nb_prefix_samples0;
+    // offset of n-th OFDM symbol
+    rxdata_offset += symbol * (frame_parms->ofdm_symbol_size + nb_prefix_samples);
+    // use OFDM symbol from within 1/8th of the CP to avoid ISI
+    rxdata_offset -= nb_prefix_samples / 8;
+
+    // if input to dft is not 256-bit aligned
+    memcpy((void *)&tmp_dft_in[symbol*frame_parms->ofdm_symbol_size],
+           (void *)&rxdata[rxdata_offset - sample_offset],
+           (frame_parms->ofdm_symbol_size) * sizeof(int32_t));
+  }
+  
+  rxdata_ptr = (int16_t *)tmp_dft_in;
+  VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_CUFFT_WAIT, 1 );
+  cudft20481(rxdata_ptr,(int16_t *)rxdataF,0);
+  VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_CUFFT_WAIT, 0 );
+  
+  // clear DC carrier from OFDM symbols
+  for (int symbol = 0; symbol < symbol_slot; symbol++) 
+    rxdataF[symbol * frame_parms->ofdm_symbol_size] = 0;
+
+  // static int cu_20481 = 0;
+  // if(cu_20481==0)
+  // {
+  //   LOG_M("./fft/cuFFT0.m","input",rxdata_ptr,frame_parms->ofdm_symbol_size*symbol_slot,1,1);
+  //   LOG_M("./fft/cuFFT1.m","cufftoutput",(int16_t *)&rxdataF[0],frame_parms->ofdm_symbol_size*symbol_slot,1,1);
+  // }
+  // cu_20481++;
+
  return 0;
 }


--- a/openair1/SCHED_NR/nr_ru_procedures.c
+++ b/openair1/SCHED_NR/nr_ru_procedures.c
@@ -660,15 +660,24 @@ void nr_fep_full(RU_t *ru, int slot) {
  // remove_7_5_kHz(ru,proc->tti_rx<<1);
  // remove_7_5_kHz(ru,1+(proc->tti_rx<<1));

-  for (l = 0; l < fp->symbols_per_slot; l++) {
-    for (aa = 0; aa < fp->nb_antennas_rx; aa++) {
-      nr_slot_fep_ul(fp,
-                     ru->common.rxdata[aa],
-                     ru->common.rxdataF[aa],
-                     l,
-                     proc->tti_rx,
-                     ru->N_TA_offset);
-    }
+  // for (l = 0; l < fp->symbols_per_slot; l++) {
+  //   for (aa = 0; aa < fp->nb_antennas_rx; aa++) {
+  //     nr_slot_fep_ul(fp,
+  //                    ru->common.rxdata[aa],
+  //                    ru->common.rxdataF[aa],
+  //                    l,
+  //                    proc->tti_rx,
+  //                    ru->N_TA_offset);
+  //   }
+  // }
+
+  for (aa = 0; aa < fp->nb_antennas_rx; aa++) {
+    cuda_nr_slot_fep_ul(fp,
+                        ru->common.rxdata[aa],
+                        ru->common.rxdataF[aa],
+                        fp->symbols_per_slot,
+                        proc->tti_rx,
+                        ru->N_TA_offset);
  }

  if (ru->idx == 0) VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME( VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_RU_FEPRX, 0 );

--- a/openair1/SCHED_NR/phy_procedures_nr_gNB.c
+++ b/openair1/SCHED_NR/phy_procedures_nr_gNB.c
@@ -503,6 +503,15 @@ void phy_procedures_gNB_common_RX(PHY_VARS_gNB *gNB, int frame_rx, int slot_rx)
  uint8_t symbol;
  unsigned char aa;

+  // for (aa = 0; aa < gNB->frame_parms.nb_antennas_rx; aa++) {
+  //   cuda_nr_slot_fep_ul(&gNB->frame_parms,
+  //                       gNB->common_vars.rxdata[aa],
+  //                       gNB->common_vars.rxdataF[aa],
+  //                       gNB->frame_parms.Ncp==EXTENDED?12:14,
+  //                       slot_rx,
+  //                       0);
+  // }
+
  for(symbol = 0; symbol < (gNB->frame_parms.Ncp==EXTENDED?12:14); symbol++) {
    for (aa = 0; aa < gNB->frame_parms.nb_antennas_rx; aa++) {
      nr_slot_fep_ul(&gNB->frame_parms,

--- a/openair1/SIMULATION/NR_PHY/ulsim.c
+++ b/openair1/SIMULATION/NR_PHY/ulsim.c
@@ -65,6 +65,7 @@

 #include <executables/softmodem-common.h>
 #include "PHY/NR_REFSIG/ul_ref_seq_nr.h"
+#include "PHY/CODING/nrLDPC_extern.h"
 //#define DEBUG_ULSIM

 LCHAN_DESC DCCH_LCHAN_DESC,DTCH_DL_LCHAN_DESC,DTCH_UL_LCHAN_DESC;
@@ -323,6 +324,8 @@ int main(int argc, char **argv)
  int ul_proc_error = 0; // uplink processing checking status flag
  //logInit();
  randominit(0);
+  // load_cuFFT();
+  // load_cuFFT1();

  /* initialize the sin-cos table */
   InitSinLUT();

--- a/targets/PROJECTS/GENERIC-NR-5GC/CONF/gnb.sa.band78.fr1.106PRB.usrpb210.conf
+++ b/targets/PROJECTS/GENERIC-NR-5GC/CONF/gnb.sa.band78.fr1.106PRB.usrpb210.conf
@@ -262,7 +262,7 @@ THREAD_STRUCT = (
    #three config for level of parallelism "PARALLEL_SINGLE_THREAD", "PARALLEL_RU_L1_SPLIT", or "PARALLEL_RU_L1_TRX_SPLIT"
    parallel_config    = "PARALLEL_SINGLE_THREAD";
    #two option for worker "WORKER_DISABLE" or "WORKER_ENABLE"
-    worker_config      = "WORKER_ENABLE";
+    worker_config      = "WORKER_DISABLE";
  }
 );