Commit a678c7eb authored by Fang-WANG

perf cufft(symbol)

parent 13752beb
cmake_minimum_required(VERSION 2.8)
project(run)
FIND_PACKAGE(CUDA REQUIRED)
# Pass options to NVCC
# CUDA sources are compiled by NVCC rather than GCC, so the flags must be passed to NVCC
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -lcufft)
# For compilation ...
# Specify target & source files to compile it from
CUDA_ADD_EXECUTABLE(run cuFFT1.cu)
# For linking ...
# Specify target & libraries to link it with
CUDA_ADD_CUFFT_TO_TARGET(run)
# Enable gdb debugging support
# SET(CMAKE_BUILD_TYPE "Debug")
# SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -Wall -g2 -ggdb")
# SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall")
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <cufft.h>
// gdb debug stub
// void testtest(int16_t *x, int16_t *y, unsigned char scale)
// {
//     printf("testtest \n");
// }
#define LEN 2048 // number of complex signal samples
void cudft2048(int16_t *x, int16_t *y, unsigned char scale)
{
    // testtest(0, 0, 0);
    cufftComplex *CompData  = (cufftComplex*)malloc(LEN * sizeof(cufftComplex)); // host buffer for the input samples
    cufftComplex *CompData1 = (cufftComplex*)malloc(LEN * sizeof(cufftComplex)); // host buffer for the result
    for (int i = 0; i < LEN; i++)
    {
        CompData[i].x = x[i*2];     // interleaved int16 input: even index = real, odd index = imaginary
        CompData[i].y = x[i*2+1];
    }
    cufftComplex *d_fftData;
    cudaMalloc((void**)&d_fftData, LEN * sizeof(cufftComplex));                          // allocate device memory
    cudaMemcpy(d_fftData, CompData, LEN * sizeof(cufftComplex), cudaMemcpyHostToDevice); // copy data from host to device
    cufftHandle plan;                                                                    // cuFFT plan handle
    cufftPlan1d(&plan, LEN, CUFFT_C2C, 1);                                               // create a 1D complex-to-complex plan
    cufftExecC2C(plan, d_fftData, d_fftData, CUFFT_FORWARD);                             // execute the forward FFT in place
    cudaDeviceSynchronize();                                                             // wait for the FFT to finish
    cudaMemcpy(CompData1, d_fftData, LEN * sizeof(cufftComplex), cudaMemcpyDeviceToHost);// copy the result from device to host
    for (int i = 0; i < LEN; i++)
    {
        y[i*2]   = CompData1[i].x / 45.2;   // scale back down towards the int16 range
        y[i*2+1] = CompData1[i].y / 45.2;
    }
    // debug: dump input and output
    // for (int i = 0; i < LEN; i++)
    // {
    //     printf("a=%d + %dj\tb=%d + %dj\n", x[i*2], x[i*2+1], y[i*2], y[i*2+1]);
    // }
    cufftDestroy(plan);
    free(CompData);
    free(CompData1);
    cudaFree(d_fftData);
}
void initcudft()
{
}
int main()
{
    int16_t *a = (int16_t *)malloc(LEN * sizeof(int32_t)); // LEN interleaved complex int16 samples
    int i;
    for (i = 0; i < LEN; i++)
    {
        *(a + 2*i)     = i;
        *(a + 2*i + 1) = LEN - i;
    }
    for (i = 0; i < 3; i++)
    {
        int32_t *b = (int32_t *)malloc(LEN * sizeof(int32_t));
        cudft2048((int16_t *)a, (int16_t *)b, 1);
        free(b);
    }
    free(a);
    return 0;
}
\ No newline at end of file
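The version above does everything on the host apart from the FFT itself and applies a fixed 45.2 scaling on the way out. As a sanity check on the transform, a naive O(N^2) CPU DFT over the same interleaved int16 layout can be compared bin by bin against what cudft2048() writes into y. The sketch below is not part of the commit; ref_dft_bin and REF_LEN are hypothetical names.

/* Hypothetical reference check (not in the commit): naive DFT of a single bin
 * over the same interleaved re/im int16 layout used by cudft2048() above. */
#include <stdint.h>
#include <math.h>

#define REF_LEN 2048
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* x: interleaved re/im int16 buffer of REF_LEN complex samples
 * k: frequency bin to evaluate; the unscaled result is written to *re / *im */
static void ref_dft_bin(const int16_t *x, int k, double *re, double *im)
{
    *re = 0.0;
    *im = 0.0;
    for (int n = 0; n < REF_LEN; n++) {
        double angle = -2.0 * M_PI * (double)k * (double)n / REF_LEN;
        double xr = x[n * 2], xi = x[n * 2 + 1];
        *re += xr * cos(angle) - xi * sin(angle);   // real part of x[n] * e^{-j*2*pi*k*n/N}
        *im += xr * sin(angle) + xi * cos(angle);   // imaginary part
    }
}

Dividing the accumulated *re and *im by the same 45.2 factor should land close to y[2k] and y[2k+1], up to int16 truncation.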
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <cufft.h>
#include <cuda_runtime.h>
#define LEN 2048
// Unpack interleaved int16 samples into cufftComplex on the device
__global__ void int_cufftComplex(int16_t *a, cufftComplex *b, int length)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= length)
    {
        return;
    }
    b[id].x = a[id*2];
    b[id].y = a[id*2+1];
}
// Scale the FFT output and pack it back into interleaved int16
__global__ void cufftComplex_int(cufftComplex *a, int16_t *b, int length)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= length)
    {
        return;
    }
    b[id*2]   = a[id].x / 45.2;
    b[id*2+1] = a[id].y / 45.2;
}
void initcudft()
{
}
void cudft2048(int16_t *x, int16_t *y, unsigned char scale)
{
    int16_t *x1;
    cudaMalloc((void**)&x1, LEN * sizeof(int32_t));                    // device buffer for the interleaved int16 samples
    cudaMemcpy(x1, x, LEN * sizeof(int32_t), cudaMemcpyHostToDevice);
    int threadNum = 512;
    int blockNum = 4;                                                  // 4 * 512 threads cover all LEN samples
    cufftComplex *CompData;
    cudaMalloc((void**)&CompData, LEN * sizeof(cufftComplex));
    int_cufftComplex<<<blockNum, threadNum>>>(x1, CompData, LEN);      // unpack int16 -> cufftComplex on the device
    cufftHandle plan;                                                  // cuFFT plan handle
    cufftPlan1d(&plan, LEN, CUFFT_C2C, 1);                             // create the plan; this is the call that needs warming up
    cufftExecC2C(plan, CompData, CompData, CUFFT_FORWARD);             // execute the forward FFT in place
    cudaDeviceSynchronize();                                           // wait for the FFT to finish
    cufftComplex_int<<<blockNum, threadNum>>>(CompData, x1, LEN);      // scale and pack the result back into int16
    cudaMemcpy(y, x1, LEN * sizeof(int32_t), cudaMemcpyDeviceToHost);  // copy the result from device to host
    // debug: dump input and output
    // for (int i = 0; i < LEN; i++)
    // {
    //     printf("a=%d + %dj\tb=%d + %dj\n", x[i*2], x[i*2+1], y[i*2], y[i*2+1]);
    // }
    cufftDestroy(plan);
    cudaFree(CompData);
    cudaFree(x1);
}
int load_cuFFT(void)
{
    initcudft();
    // warm-up call with scratch buffers so the first real transform does not pay the plan-creation cost
    int16_t *a = (int16_t *)malloc(LEN * sizeof(int32_t));
    int16_t *b = (int16_t *)malloc(LEN * sizeof(int32_t));
    cudft2048(a, b, 1);
    free(a);
    free(b);
    return 0;
}
int main()
{
    load_cuFFT();
    int16_t *a = (int16_t *)malloc(LEN * sizeof(int32_t));
    int16_t *b = (int16_t *)malloc(LEN * sizeof(int32_t));
    int i;
    for (i = 0; i < LEN; i++)
    {
        *(a + 2*i)     = i;
        *(a + 2*i + 1) = LEN - i;
    }
    for (i = 0; i < 3; i++)
    {
        cudft2048((int16_t *)a, (int16_t *)b, 0);
    }
    free(a);
    free(b);
    return 0;
}
\ No newline at end of file
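Since this revision is a performance change (the int16 conversion now runs in device kernels instead of host loops), it is worth timing. The sketch below is not part of the commit: time_cudft2048_once is a hypothetical helper that measures one call to cudft2048() with CUDA events, assuming the definitions from the file above are in scope.

/* Hypothetical timing harness (not in the commit): time one cudft2048() call
 * with CUDA events, the usual way to measure GPU-side elapsed time. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

static float time_cudft2048_once(int16_t *a, int16_t *b)
{
    cudaEvent_t start, stop;
    float ms = 0.0f;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);          // record on the default stream
    cudft2048(a, b, 0);                 // the transform under test
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);         // block until the stop event has completed
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;                          // elapsed time in milliseconds
}

Calling it a few times and discarding the first measurement separates the one-off plan-creation cost from the steady-state cost per transform.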
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <cufft.h>
#include <cuda_runtime.h>
#define LEN 2048
#define SQRT2048_real 45.2876 // scaling factor applied to the real part of the output
#define SQRT2048_imag 45.3065 // scaling factor applied to the imaginary part of the output
// Unpack interleaved int16 samples into cufftComplex on the device
__global__ void int_cufftComplex(int16_t *a, cufftComplex *b, int length)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= length)
    {
        return;
    }
    b[id].x = a[id*2];
    b[id].y = a[id*2+1];
}
// Scale the FFT output and pack it back into interleaved int16
__global__ void cufftComplex_int(cufftComplex *a, int16_t *b, int length)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= length)
    {
        return;
    }
    b[id*2]   = a[id].x / SQRT2048_real;
    b[id*2+1] = a[id].y / SQRT2048_imag;
    // b[id*2]   = a[id].x;    // unscaled variant kept for reference
    // b[id*2+1] = a[id].y;
}
// Device buffers and the cuFFT plan are kept as module-level symbols so they
// are allocated and planned once in initcudft() instead of on every call.
int16_t *x1;
cufftComplex *CompData;
cufftHandle plan;
void initcudft()
{
    cudaMalloc((void**)&x1, LEN * sizeof(int32_t));
    cudaMalloc((void**)&CompData, LEN * sizeof(cufftComplex));
    cufftPlan1d(&plan, LEN, CUFFT_C2C, 1); // create the plan once; this is the call that needs warming up
}
void destroycudft()
{
    // cudaFree(CompData);
    // cudaFree(x1);
    // cufftDestroy(plan);
}
void cudft2048(int16_t *x, int16_t *y, unsigned char scale)
{
    cudaMemcpy(x1, x, LEN * sizeof(int32_t), cudaMemcpyHostToDevice);  // copy the interleaved int16 input to the device
    int threadNum = 512;
    int blockNum = 4;                                                  // 4 * 512 threads cover all LEN samples
    int_cufftComplex<<<blockNum, threadNum>>>(x1, CompData, LEN);      // unpack int16 -> cufftComplex on the device
    cufftExecC2C(plan, CompData, CompData, CUFFT_FORWARD);             // execute the forward FFT in place, reusing the cached plan
    cudaDeviceSynchronize();                                           // wait for the FFT to finish
    cufftComplex_int<<<blockNum, threadNum>>>(CompData, x1, LEN);      // scale and pack the result back into int16
    cudaMemcpy(y, x1, LEN * sizeof(int32_t), cudaMemcpyDeviceToHost);  // copy the result from device to host
}
void load_cuFFT(void)
{
    initcudft();
    // warm-up transform with scratch buffers so later calls see steady-state timing
    int16_t *a = (int16_t *)malloc(LEN * sizeof(int32_t));
    int16_t *b = (int16_t *)malloc(LEN * sizeof(int32_t));
    cudft2048(a, b, 1);
    free(a);
    free(b);
}
int main()
{
    load_cuFFT();
    int16_t *a = (int16_t *)malloc(LEN * sizeof(int32_t));
    int16_t *b = (int16_t *)malloc(LEN * sizeof(int32_t));
    for (int i = 0; i < LEN; i++)
    {
        *(a + 2*i)     = i;
        *(a + 2*i + 1) = LEN - i;
    }
    for (int i = 0; i < 3; i++)
    {
        cudft2048((int16_t *)a, (int16_t *)b, 0);
        printf("FFT results:\n");
        for (int j = 0; j < LEN; j++)
        {
            printf("a=%d + %dj\tb=%d + %dj\n", a[j*2], a[j*2+1], b[j*2], b[j*2+1]);
        }
    }
    destroycudft();
    free(a);
    free(b);
    return 0;
}
\ No newline at end of file
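None of the cudaMalloc, cudaMemcpy, kernel-launch, or cuFFT calls above check their return codes, so a failed allocation or plan creation would only show up later as wrong output. A common pattern is to wrap each call in a checking macro; the CUDA_CHECK and CUFFT_CHECK macros below are a hypothetical sketch, not part of the commit.

/* Hypothetical error-check helpers (not in the commit). */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cufft.h>

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

#define CUFFT_CHECK(call)                                             \
    do {                                                              \
        cufftResult res_ = (call);                                    \
        if (res_ != CUFFT_SUCCESS) {                                  \
            fprintf(stderr, "cuFFT error %d at %s:%d\n",              \
                    (int)res_, __FILE__, __LINE__);                   \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

/* Example usage inside initcudft():
 *   CUDA_CHECK(cudaMalloc((void**)&x1, LEN * sizeof(int32_t)));
 *   CUFFT_CHECK(cufftPlan1d(&plan, LEN, CUFFT_C2C, 1));
 */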
@@ -262,7 +262,7 @@ THREAD_STRUCT = (
 #three config for level of parallelism "PARALLEL_SINGLE_THREAD", "PARALLEL_RU_L1_SPLIT", or "PARALLEL_RU_L1_TRX_SPLIT"
 parallel_config = "PARALLEL_SINGLE_THREAD";
 #two option for worker "WORKER_DISABLE" or "WORKER_ENABLE"
-worker_config = "WORKER_ENABLE";
+worker_config = "WORKER_DISABLE";
 }
 );