Commit a678c7eb authored by Fang-WANG

perf cufft(symbol)

parent 13752beb
cmake_minimum_required(VERSION 2.8)
project(run)
FIND_PACKAGE(CUDA REQUIRED)
# Pass options to NVCC
# CUDA sources are compiled by NVCC rather than GCC, so the flags must be passed to NVCC
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -lcufft)
# For compilation ...
# Specify target & source files to compile it from
CUDA_ADD_EXECUTABLE(run cuFFT1.cu)
# For linking ...
# Specify target & libraries to link it with
CUDA_ADD_CUFFT_TO_TARGET(run)
# Enable gdb debugging support
# SET(CMAKE_BUILD_TYPE "Debug")
# SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -Wall -g2 -ggdb")
# SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall")
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <cufft.h>
// gdb debug stub
// void testtest(int16_t *x, int16_t *y, unsigned char scale)
// {
//     printf("testtest \n");
// }
#define LEN 2048 // number of complex signal samples
void cudft2048(int16_t *x, int16_t *y, unsigned char scale)
{
    // testtest(0, 0, 0);
    cufftComplex *CompData  = (cufftComplex*)malloc(LEN * sizeof(cufftComplex)); // host buffer for the input samples
    cufftComplex *CompData1 = (cufftComplex*)malloc(LEN * sizeof(cufftComplex)); // host buffer for the result
    for (int i = 0; i < LEN; i++)
    {
        CompData[i].x = x[i*2];     // interleaved int16 input: even index = real, odd index = imaginary
        CompData[i].y = x[i*2+1];
    }
    cufftComplex *d_fftData;
    cudaMalloc((void**)&d_fftData, LEN * sizeof(cufftComplex));                          // allocate device memory
    cudaMemcpy(d_fftData, CompData, LEN * sizeof(cufftComplex), cudaMemcpyHostToDevice); // copy data from host to device
    cufftHandle plan;                                                                    // cuFFT plan handle
    cufftPlan1d(&plan, LEN, CUFFT_C2C, 1);                                               // create a 1D complex-to-complex plan
    cufftExecC2C(plan, d_fftData, d_fftData, CUFFT_FORWARD);                             // execute the forward FFT in place
    cudaDeviceSynchronize();                                                             // wait for the FFT to finish
    cudaMemcpy(CompData1, d_fftData, LEN * sizeof(cufftComplex), cudaMemcpyDeviceToHost);// copy the result from device to host
    for (int i = 0; i < LEN; i++)
    {
        y[i*2]   = CompData1[i].x / 45.2;   // scale back down towards the int16 range
        y[i*2+1] = CompData1[i].y / 45.2;
    }
    // debug: dump input and output
    // for (int i = 0; i < LEN; i++)
    // {
    //     printf("a=%d + %dj\tb=%d + %dj\n", x[i*2], x[i*2+1], y[i*2], y[i*2+1]);
    // }
    cufftDestroy(plan);
    free(CompData);
    free(CompData1);
    cudaFree(d_fftData);
}
void initcudft()
{
}
int main()
{
    int16_t *a = (int16_t *)malloc(LEN * sizeof(int32_t)); // LEN interleaved complex int16 samples
    int i;
    for (i = 0; i < LEN; i++)
    {
        *(a + 2*i)     = i;
        *(a + 2*i + 1) = LEN - i;
    }
    for (i = 0; i < 3; i++)
    {
        int32_t *b = (int32_t *)malloc(LEN * sizeof(int32_t));
        cudft2048((int16_t *)a, (int16_t *)b, 1);
        free(b);
    }
    free(a);
    return 0;
}
\ No newline at end of file
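The version above does everything on the host apart from the FFT itself and applies a fixed 45.2 scaling on the way out. As a sanity check on the transform, a naive O(N^2) CPU DFT over the same interleaved int16 layout can be compared bin by bin against what cudft2048() writes into y. The sketch below is not part of the commit; ref_dft_bin and REF_LEN are hypothetical names.

/* Hypothetical reference check (not in the commit): naive DFT of a single bin
 * over the same interleaved re/im int16 layout used by cudft2048() above. */
#include <stdint.h>
#include <math.h>

#define REF_LEN 2048
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* x: interleaved re/im int16 buffer of REF_LEN complex samples
 * k: frequency bin to evaluate; the unscaled result is written to *re / *im */
static void ref_dft_bin(const int16_t *x, int k, double *re, double *im)
{
    *re = 0.0;
    *im = 0.0;
    for (int n = 0; n < REF_LEN; n++) {
        double angle = -2.0 * M_PI * (double)k * (double)n / REF_LEN;
        double xr = x[n * 2], xi = x[n * 2 + 1];
        *re += xr * cos(angle) - xi * sin(angle);   // real part of x[n] * e^{-j*2*pi*k*n/N}
        *im += xr * sin(angle) + xi * cos(angle);   // imaginary part
    }
}

Dividing the accumulated *re and *im by the same 45.2 factor should land close to y[2k] and y[2k+1], up to int16 truncation.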
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <cufft.h>
#include <cuda_runtime.h>
#define LEN 2048
// Unpack interleaved int16 samples into cufftComplex on the device
__global__ void int_cufftComplex(int16_t *a, cufftComplex *b, int length)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= length)
    {
        return;
    }
    b[id].x = a[id*2];
    b[id].y = a[id*2+1];
}
// Scale the FFT output and pack it back into interleaved int16
__global__ void cufftComplex_int(cufftComplex *a, int16_t *b, int length)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= length)
    {
        return;
    }
    b[id*2]   = a[id].x / 45.2;
    b[id*2+1] = a[id].y / 45.2;
}
void initcudft()
{
}
void cudft2048(int16_t *x, int16_t *y, unsigned char scale)
{
    int16_t *x1;
    cudaMalloc((void**)&x1, LEN * sizeof(int32_t));                    // device buffer for the interleaved int16 samples
    cudaMemcpy(x1, x, LEN * sizeof(int32_t), cudaMemcpyHostToDevice);
    int threadNum = 512;
    int blockNum = 4;                                                  // 4 * 512 threads cover all LEN samples
    cufftComplex *CompData;
    cudaMalloc((void**)&CompData, LEN * sizeof(cufftComplex));
    int_cufftComplex<<<blockNum, threadNum>>>(x1, CompData, LEN);      // unpack int16 -> cufftComplex on the device
    cufftHandle plan;                                                  // cuFFT plan handle
    cufftPlan1d(&plan, LEN, CUFFT_C2C, 1);                             // create the plan; this is the call that needs warming up
    cufftExecC2C(plan, CompData, CompData, CUFFT_FORWARD);             // execute the forward FFT in place
    cudaDeviceSynchronize();                                           // wait for the FFT to finish
    cufftComplex_int<<<blockNum, threadNum>>>(CompData, x1, LEN);      // scale and pack the result back into int16
    cudaMemcpy(y, x1, LEN * sizeof(int32_t), cudaMemcpyDeviceToHost);  // copy the result from device to host
    // debug: dump input and output
    // for (int i = 0; i < LEN; i++)
    // {
    //     printf("a=%d + %dj\tb=%d + %dj\n", x[i*2], x[i*2+1], y[i*2], y[i*2+1]);
    // }
    cufftDestroy(plan);
    cudaFree(CompData);
    cudaFree(x1);
}
int load_cuFFT(void)
{
    initcudft();
    // warm-up call with scratch buffers so the first real transform does not pay the plan-creation cost
    int16_t *a = (int16_t *)malloc(LEN * sizeof(int32_t));
    int16_t *b = (int16_t *)malloc(LEN * sizeof(int32_t));
    cudft2048(a, b, 1);
    free(a);
    free(b);
    return 0;
}
int main()
{
    load_cuFFT();
    int16_t *a = (int16_t *)malloc(LEN * sizeof(int32_t));
    int16_t *b = (int16_t *)malloc(LEN * sizeof(int32_t));
    int i;
    for (i = 0; i < LEN; i++)
    {
        *(a + 2*i)     = i;
        *(a + 2*i + 1) = LEN - i;
    }
    for (i = 0; i < 3; i++)
    {
        cudft2048((int16_t *)a, (int16_t *)b, 0);
    }
    free(a);
    free(b);
    return 0;
}
\ No newline at end of file
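Since this revision is a performance change (the int16 conversion now runs in device kernels instead of host loops), it is worth timing. The sketch below is not part of the commit: time_cudft2048_once is a hypothetical helper that measures one call to cudft2048() with CUDA events, assuming the definitions from the file above are in scope.

/* Hypothetical timing harness (not in the commit): time one cudft2048() call
 * with CUDA events, the usual way to measure GPU-side elapsed time. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

static float time_cudft2048_once(int16_t *a, int16_t *b)
{
    cudaEvent_t start, stop;
    float ms = 0.0f;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);          // record on the default stream
    cudft2048(a, b, 0);                 // the transform under test
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);         // block until the stop event has completed
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;                          // elapsed time in milliseconds
}

Calling it a few times and discarding the first measurement separates the one-off plan-creation cost from the steady-state cost per transform.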
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <cufft.h>
#include <cuda_runtime.h>
#define LEN 2048
#define SQRT2048_real 45.2876 // scaling factor applied to the real part of the output
#define SQRT2048_imag 45.3065 // scaling factor applied to the imaginary part of the output
// Unpack interleaved int16 samples into cufftComplex on the device
__global__ void int_cufftComplex(int16_t *a, cufftComplex *b, int length)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= length)
    {
        return;
    }
    b[id].x = a[id*2];
    b[id].y = a[id*2+1];
}
// Scale the FFT output and pack it back into interleaved int16
__global__ void cufftComplex_int(cufftComplex *a, int16_t *b, int length)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= length)
    {
        return;
    }
    b[id*2]   = a[id].x / SQRT2048_real;
    b[id*2+1] = a[id].y / SQRT2048_imag;
    // b[id*2]   = a[id].x;    // unscaled variant kept for reference
    // b[id*2+1] = a[id].y;
}
// Device buffers and the cuFFT plan are kept as module-level symbols so they
// are allocated and planned once in initcudft() instead of on every call.
int16_t *x1;
cufftComplex *CompData;
cufftHandle plan;
void initcudft()
{
    cudaMalloc((void**)&x1, LEN * sizeof(int32_t));
    cudaMalloc((void**)&CompData, LEN * sizeof(cufftComplex));
    cufftPlan1d(&plan, LEN, CUFFT_C2C, 1); // create the plan once; this is the call that needs warming up
}
void destroycudft()
{
    // cudaFree(CompData);
    // cudaFree(x1);
    // cufftDestroy(plan);
}
void cudft2048(int16_t *x, int16_t *y, unsigned char scale)
{
    cudaMemcpy(x1, x, LEN * sizeof(int32_t), cudaMemcpyHostToDevice);  // copy the interleaved int16 input to the device
    int threadNum = 512;
    int blockNum = 4;                                                  // 4 * 512 threads cover all LEN samples
    int_cufftComplex<<<blockNum, threadNum>>>(x1, CompData, LEN);      // unpack int16 -> cufftComplex on the device
    cufftExecC2C(plan, CompData, CompData, CUFFT_FORWARD);             // execute the forward FFT in place, reusing the cached plan
    cudaDeviceSynchronize();                                           // wait for the FFT to finish
    cufftComplex_int<<<blockNum, threadNum>>>(CompData, x1, LEN);      // scale and pack the result back into int16
    cudaMemcpy(y, x1, LEN * sizeof(int32_t), cudaMemcpyDeviceToHost);  // copy the result from device to host
}
void load_cuFFT(void)
{
    initcudft();
    // warm-up transform with scratch buffers so later calls see steady-state timing
    int16_t *a = (int16_t *)malloc(LEN * sizeof(int32_t));
    int16_t *b = (int16_t *)malloc(LEN * sizeof(int32_t));
    cudft2048(a, b, 1);
    free(a);
    free(b);
}
int main()
{
    load_cuFFT();
    int16_t *a = (int16_t *)malloc(LEN * sizeof(int32_t));
    int16_t *b = (int16_t *)malloc(LEN * sizeof(int32_t));
    for (int i = 0; i < LEN; i++)
    {
        *(a + 2*i)     = i;
        *(a + 2*i + 1) = LEN - i;
    }
    for (int i = 0; i < 3; i++)
    {
        cudft2048((int16_t *)a, (int16_t *)b, 0);
        printf("FFT results:\n");
        for (int j = 0; j < LEN; j++)
        {
            printf("a=%d + %dj\tb=%d + %dj\n", a[j*2], a[j*2+1], b[j*2], b[j*2+1]);
        }
    }
    destroycudft();
    free(a);
    free(b);
    return 0;
}
\ No newline at end of file
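None of the cudaMalloc, cudaMemcpy, kernel-launch, or cuFFT calls above check their return codes, so a failed allocation or plan creation would only show up later as wrong output. A common pattern is to wrap each call in a checking macro; the CUDA_CHECK and CUFFT_CHECK macros below are a hypothetical sketch, not part of the commit.

/* Hypothetical error-check helpers (not in the commit). */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cufft.h>

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

#define CUFFT_CHECK(call)                                             \
    do {                                                              \
        cufftResult res_ = (call);                                    \
        if (res_ != CUFFT_SUCCESS) {                                  \
            fprintf(stderr, "cuFFT error %d at %s:%d\n",              \
                    (int)res_, __FILE__, __LINE__);                   \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

/* Example usage inside initcudft():
 *   CUDA_CHECK(cudaMalloc((void**)&x1, LEN * sizeof(int32_t)));
 *   CUFFT_CHECK(cufftPlan1d(&plan, LEN, CUFFT_C2C, 1));
 */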
@@ -262,7 +262,7 @@ THREAD_STRUCT = (
 #three config for level of parallelism "PARALLEL_SINGLE_THREAD", "PARALLEL_RU_L1_SPLIT", or "PARALLEL_RU_L1_TRX_SPLIT"
 parallel_config = "PARALLEL_SINGLE_THREAD";
 #two option for worker "WORKER_DISABLE" or "WORKER_ENABLE"
-worker_config = "WORKER_ENABLE";
+worker_config = "WORKER_DISABLE";
 }
 );