Commit 82b32199 authored by ISIP CS/NCTU's avatar ISIP CS/NCTU

GPU part uploading

parent ae0494b0
#ifndef CUDA_debug
//#define CUDA_debug
#endif
#ifndef CUDA
#define CUDA
#endif
\ No newline at end of file
#ifndef __INIT__DEFS_CU__H__
#define __INIT__DEFS_CU__H__
#include <stdint.h>
#include <stdio.h>
#include "PHY/impl_defs_lte.h"
#include "PHY/defs.h"
#ifdef __cplusplus
extern "C"
#endif
void init_cuda( PHY_VARS_eNB *phy_vars_eNB, LTE_DL_FRAME_PARMS frame_parms );
#ifdef __cplusplus
extern "C"
#endif
void free_cufft();
#endif
#include "defs.h"
#include "PHY/CUDA/extern.h"
#include "PHY/LTE_TRANSPORT/extern.h"
#include <stdio.h>
int device_count;
dl_cu_t dl_cu[10];
ul_cu_t ul_cu[10];
estimation_const_t esti_const;
para_ulsch ulsch_para[10];
ext_rbs ext_rbs_para[10];
void init_cufft( void );
void free_cufft( void );
__global__ void generate_ul_ref_sigs_rx_cu( float2 *x, unsigned int Msc_RS, unsigned int u, unsigned int v )
{
unsigned short ref_primes[33] = {11,23,31,47,50,71,89,107,113,139,179,191,211,239,283,293,317,359,383,431,479,523,571,599,647,719,863,887,953,971,1069,1151,1193};
float qbar;
float phase;
unsigned short dftsizes[33] = { 12,24,36,48,60,72,96,108,120,144,180,192,216,240,288,300,324,360,384,432,480,540,576,600,648,720,864,900,960,972,1080,1152,1200 };
char ref24[720] = {
-1,3,1,-3,3,-1,1,3,-3,3,1,3,-3,3,1,1,-1,1,3,-3,3,-3,-1,-3,-3,3,-3,-3,-3,1,-3,-3,3,-1,1,1,1,3,1,-1,3,-3,-3,1,3,1,1,-3,3,-1,3,3,1,1,-3,3,3,3,3,1,-1,3,-1,1,1,-1,-3,-1,-1,1,3,3,-1,-3,1,1,3,-3,1,1,-3,-1,-1,1,3,1,3,1,-1,3,1,1,-3,-1,-3,-1,-1,-1,-1,-3,-3,-1,1,1,3,3,-1,3,-1,1,-1,-3,1,-1,-3,-3,1,-3,-1,-1,-3,1,1,3,-1,1,3,1,-3,1,-3,1,1,-1,-1,3,-1,-3,3,-3,-3,-3,1,1,1,1,-1,-1,3,-3,-3,3,-3,1,-1,-1,1,-1,1,1,-1,-3,-1,1,-1,3,-1,-3,-3,3,3,-1,-1,-3,-1,3,1,3,1,3,1,1,-1,3,1,-1,1,3,-3,-1,-1,1,-3,1,3,-3,1,-1,-3,3,-3,3,-1,-1,-1,-1,1,-3,-3,-3,1,-3,-3,-3,1,-3,1,1,-3,3,3,-1,-3,-1,3,-3,3,3,3,-1,1,1,-3,1,-1,1,1,-3,1,1,-1,1,-3,-3,3,-1,3,-1,-1,-3,-3,-3,-1,-3,-3,1,-1,1,3,3,-1,1,-1,3,1,3,3,-3,-3,1,3,1,-1,-3,-3,-3,3,3,-3,3,3,-1,-3,3,-1,1,-3,1,1,3,3,1,1,1,-1,-1,1,-3,3,-1,1,1,-3,3,3,-1,-3,3,-3,-1,-3,-1,-1,-1,-1,-1,-3,-1,3,3,1,-1,1,3,3,3,-1,1,1,-3,1,3,-1,-3,3,-3,-3,3,1,3,1,-3,3,1,3,1,1,3,3,-1,-1,-3,1,-3,-1,3,1,1,3,-1,-1,1,-3,1,3,-3,1,-1,-3,-1,3,1,3,1,-1,-3,-3,-1,-1,-3,-3,-3,-1,-1,-3,3,-1,-1,-1,-1,1,1,-3,3,1,3,3,1,-1,1,-3,1,-3,1,1,-3,-1,1,3,-1,3,3,-1,-3,1,-1,-3,3,3,3,-1,1,1,3,-1,-3,-1,3,-1,-1,-1,1,1,1,1,1,-1,3,-1,-3,1,1,3,-3,1,-3,-1,1,1,-3,-3,3,1,1,-3,1,3,3,1,-1,-3,3,-1,3,3,3,-3,1,-1,1,-1,-3,-1,1,3,-1,3,-3,-3,-1,-3,3,-3,-3,-3,-1,-1,-3,-1,-3,3,1,3,-3,-1,3,-1,1,-1,3,-3,1,-1,-3,-3,1,1,-1,1,-1,1,-1,3,1,-3,-1,1,-1,1,-1,-1,3,3,-3,-1,1,-3,-3,-1,-3,3,1,-1,-3,-1,-3,-3,3,-3,3,-3,-1,1,3,1,-3,1,3,3,-1,-3,-1,-1,-1,-1,3,3,3,1,3,3,-3,1,3,-1,3,-1,3,3,-3,3,1,-1,3,3,1,-1,3,3,-1,-3,3,-3,-1,-1,3,-1,3,-1,-1,1,1,1,1,-1,-1,-3,-1,3,1,-1,1,-1,3,-1,3,1,1,-1,-1,-3,1,1,-3,1,3,-3,1,1,-3,-3,-1,-1,-3,-1,1,3,1,1,-3,-1,-1,-3,3,-3,3,1,-3,3,-3,1,-1,1,-3,1,1,1,-1,-3,3,3,1,1,3,-1,-3,-1,-1,-1,3,1,-3,-3,-1,3,-3,-1,-3,-1,-3,-1,-1,-3,-1,-1,1,-3,-1,-1,1,-1,-3,1,1,-3,1,-3,-3,3,1,1,-1,3,-1,-1,1,1,-1,-1,-3,-1,3,-1,3,-1,1,3,1,-1,3,1,3,-3,-3,1,-1,-1,1,3
};
char ref12[360] = {-1,1,3,-3,3,3,1,1,3,1,-3,3,1,1,3,3,3,-1,1,-3,-3,1,-3,3,1,1,-3,-3,-3,-1,-3,-3,1,-3,1,-1,-1,1,1,1,1,-1,-3,-3,1,-3,3,-1,-1,3,1,-1,1,-1,-3,-1,1,-1,1,3,1,-3,3,-1,-1,1,1,-1,-1,3,-3,1,-1,3,-3,-3,-3,3,1,-1,3,3,-3,1,-3,-1,-1,-1,1,-3,3,-1,1,-3,3,1,1,-3,3,1,-1,-1,-1,1,1,3,-1,1,1,-3,-1,3,3,-1,-3,1,1,1,1,1,-1,3,-1,1,1,-3,-3,-1,-3,-3,3,-1,3,1,-1,-1,3,3,-3,1,3,1,3,3,1,-3,1,1,-3,1,1,1,-3,-3,-3,1,3,3,-3,3,-3,1,1,3,-1,-3,3,3,-3,1,-1,-3,-1,3,1,3,3,3,-1,1,3,-1,1,-3,-1,-1,1,1,3,1,-1,-3,1,3,1,-1,1,3,3,3,-1,-1,3,-1,-3,1,1,3,-3,3,-3,-3,3,1,3,-1,-3,3,1,1,-3,1,-3,-3,-1,-1,1,-3,-1,3,1,3,1,-1,-1,3,-3,-1,-3,-1,-1,-3,1,1,1,1,3,1,-1,1,-3,-1,-1,3,-1,1,-3,-3,-3,-3,-3,1,-1,-3,1,1,-3,-3,-3,-3,-1,3,-3,1,-3,3,1,1,-1,-3,-1,-3,1,-1,1,3,-1,1,1,1,3,1,3,3,-1,1,-1,-3,-3,1,1,-3,3,3,1,3,3,1,-3,-1,-1,3,1,3,-3,-3,3,-3,1,-1,-1,3,-1,-3,-3,-1,-3,-1,-3,3,1,-1,1,3,-3,-3,-1,3,-3,3,-1,3,3,-3,3,3,-1,-1,3,-3,-3,-1,-1,-3,-1,3,-3,3,1,-1};
unsigned int q,m,n;
if( Msc_RS >= 2 )
{
qbar = ref_primes[Msc_RS] * (u+1)/(double)31;
if ((((int)floor(2*qbar))&1) == 0)
q = (int)(floor(qbar+.5)) - v;
else
q = (int)(floor(qbar+.5)) + v;
for (n=0; n<dftsizes[Msc_RS]; n++)
{
m=n%ref_primes[Msc_RS];
phase = (float)q*m*(m+1)/ref_primes[Msc_RS];
x[n].x = cosf(M_PI*phase);
x[n].y =-sinf(M_PI*phase);
}
}
else if ( Msc_RS == 1 )
{
for (n=0; n<dftsizes[1]; n++) {
x[n].x = cosf(M_PI*((float)ref24[(u*24) + n])/4);
x[n].y = sinf(M_PI*((float)ref24[(u*24) + n])/4);
}
}
else if ( Msc_RS == 0 )
{
for (n=0; n<dftsizes[0]; n++) {
x[n].x = cosf(M_PI*ref12[(u*12) + n]/4);
x[n].y = sinf(M_PI*ref12[(u*12) + n]/4);
}
}
}
void init_cuda(PHY_VARS_eNB *phy_vars_eNB, LTE_DL_FRAME_PARMS frame_parms )
{
unsigned short dftsizes[33] = { 12,24,36,48,60,72,96,108,120,144,180,192,216,240,288,300,324,360,384,432,480,540,576,600,648,720,864,900,960,972,1080,1152,1200 };
int i,j,k;
int u,v,Msc_RS;
cudaGetDeviceCount(&device_count);
printf("[CUDA] now we have %d device\n",device_count);
LTE_DL_FRAME_PARMS* const frame_parm = &phy_vars_eNB->lte_frame_parms;
LTE_eNB_COMMON* const eNB_common_vars = &phy_vars_eNB->lte_eNB_common_vars;
LTE_eNB_PUSCH** const eNB_pusch_vars = phy_vars_eNB->lte_eNB_pusch_vars;
LTE_eNB_SRS* const eNB_srs_vars = phy_vars_eNB->lte_eNB_srs_vars;
LTE_eNB_PRACH* const eNB_prach_vars = &phy_vars_eNB->lte_eNB_prach_vars;
for ( i = 0; i < device_count; i++ )
{
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, i);
printf("[CUDA] device number= %d, device name= %s\n",i, deviceProp.name);
}
for ( Msc_RS = 0; Msc_RS < 2; Msc_RS++ )
{
for ( u = 0; u < 30; u++ )
{
for ( v = 0; v < 1; v++ )
{
cudaMalloc( ( void **)&esti_const.d_ul_ref_sigs_rx[u][v][Msc_RS], 2*sizeof( float2 )*dftsizes[Msc_RS] );
generate_ul_ref_sigs_rx_cu<<< 1, 1>>>( esti_const.d_ul_ref_sigs_rx[u][v][Msc_RS], Msc_RS, u, v );
}
}
}
for ( Msc_RS = 2; Msc_RS < 33; Msc_RS++ )
{
for ( u = 0; u < 30; u++ )
{
for ( v = 0; v < 2; v++ )
{
cudaMalloc( ( void **)&esti_const.d_ul_ref_sigs_rx[u][v][Msc_RS], 2*sizeof( float2 )*dftsizes[Msc_RS] );
generate_ul_ref_sigs_rx_cu<<< 1, 1>>>( esti_const.d_ul_ref_sigs_rx[u][v][Msc_RS], Msc_RS, u, v );
}
}
}
//host mem alloc
/*
int eNB_id, UE_id;
for ( eNB_id = 0; eNB_id < 3; eNB_id++ )
{
printf("Initial host port to device port\n");
printf("Initial RX port\n");
cudaMallocHost((void **) &eNB_common_vars->rxdata_7_5kHz[eNB_id],frame_parm->nb_antennas_rx*sizeof(int*));
cudaMallocHost((void **) &eNB_common_vars->rxdataF[eNB_id], frame_parm->nb_antennas_rx*sizeof(int*));
for ( i = 0; i < frame_parms.nb_antennas_rx; i++ )
{
cudaMallocHost((void **)&eNB_common_vars->rxdata_7_5kHz[eNB_id][i], frame_parm->samples_per_tti*sizeof(int));
cudaMallocHost((void **)&eNB_common_vars->rxdataF[eNB_id][i], 2*sizeof(int)*(frame_parm->ofdm_symbol_size*frame_parm->symbols_per_tti) );
}
printf("Initial TX port\n");
cudaMallocHost((void **)eNB_common_vars->txdataF[eNB_id], frame_parm->nb_antennas_tx*sizeof(int*));
for ( i = 0; i < frame_parms.nb_antennas_rx; i++ )
{
cudaMallocHost((void **)&eNB_common_vars->txdataF[eNB_id][i], 2*(frame_parm->ofdm_symbol_size*frame_parm->symbols_per_tti)*sizeof(int) );
}
}
for ( UE_id = 0; UE_id < NUMBER_OF_UE_MAX; UE_id++ )
{
for ( eNB_id = 0; eNB_id < 3; eNB_id++ )
{
cudaMallocHost((void **) &eNB_pusch_vars[UE_id]->rxdataF_comp[eNB_id], frame_parm->nb_antennas_rx*sizeof(int*));
for ( i = 0; i < frame_parms.nb_antennas_rx; i++ )
{
cudaMallocHost((void **)&eNB_pusch_vars[UE_id]->rxdataF_comp[eNB_id][i], sizeof(int)*frame_parm->N_RB_UL*12*frame_parm->symbols_per_tti );
}
}
}
*/
for ( i = 0; i < 10; i++ )
{
ul_cu[i].CP = frame_parms.nb_prefix_samples;
ul_cu[i].CP0= frame_parms.nb_prefix_samples0;
ul_cu[i].fftsize = frame_parms.ofdm_symbol_size;
ul_cu[i].Ncp = frame_parms.Ncp;
ul_cu[i].symbols_per_tti = frame_parms.symbols_per_tti;
ul_cu[i].samples_per_tti = frame_parms.samples_per_tti;
ul_cu[i].nb_antennas_rx = frame_parms.nb_antennas_rx;
ul_cu[i].N_RB_UL = frame_parms.N_RB_UL;
ul_cu[i].d_rxdata = ( int **)malloc( frame_parms.nb_antennas_rx * sizeof( int *) );
ul_cu[i].d_rxdata_fft = ( float2 **)malloc( frame_parms.nb_antennas_rx * sizeof( float2 *) );
ul_cu[i].d_rxdataF = ( int **)malloc( frame_parms.nb_antennas_rx * sizeof( int *) );
ul_cu[i].d_rxdata_ext = ( float2 **)malloc( frame_parms.nb_antennas_rx * sizeof( float2 *) );
ul_cu[i].d_rxdata_ext_int = ( int **)malloc( frame_parms.nb_antennas_rx * sizeof( int *) );
ul_cu[i].d_rxdata_comp = ( float2 **)malloc( frame_parms.nb_antennas_rx * sizeof( float2 *) );
ul_cu[i].d_rxdata_comp_int = ( int **)malloc( frame_parms.nb_antennas_rx * sizeof( int *) );
ul_cu[i].d_drs_ch = ( float2 **)malloc( frame_parms.nb_antennas_rx * sizeof( float2 *) );
ul_cu[i].d_drs_ch_int = ( int **)malloc( frame_parms.nb_antennas_rx * sizeof( int *) );
ul_cu[i].d_ulsch_power = ( int **)malloc( frame_parms.nb_antennas_rx * sizeof( int *) );
for ( k = 0; k < ul_cu[i].nb_antennas_rx; k++ )
{
if(cudaMalloc(( void **)&ul_cu[i].d_rxdata[k] , sizeof( int )* 15* 512))
printf("error\n");
cudaMalloc(( void **)&ul_cu[i].d_rxdata_fft[k] , sizeof( float2 )* ul_cu[i].symbols_per_tti* ul_cu[i].fftsize);
cudaMalloc(( void **)&ul_cu[i].d_rxdataF[k] , 2* sizeof( int )* ul_cu[i].symbols_per_tti* ul_cu[i].fftsize );
cudaMalloc(( void **)&ul_cu[i].d_rxdata_ext[k] , sizeof( float2 )* frame_parms.N_RB_UL* 12* frame_parms.symbols_per_tti );
cudaMalloc(( void **)&ul_cu[i].d_rxdata_ext_int[k] , sizeof( int )* frame_parms.N_RB_UL* 12* frame_parms.symbols_per_tti );
cudaMalloc(( void **)&ul_cu[i].d_rxdata_comp[k] , sizeof( float2 )* frame_parms.N_RB_UL* 12* frame_parms.symbols_per_tti );
cudaMalloc(( void **)&ul_cu[i].d_rxdata_comp_int[k], sizeof( int )* frame_parms.N_RB_UL* 12* frame_parms.symbols_per_tti );
cudaMalloc(( void **)&ul_cu[i].d_drs_ch[k] , sizeof( float2 )* frame_parms.N_RB_UL* 12* frame_parms.symbols_per_tti );
cudaMalloc(( void **)&ul_cu[i].d_drs_ch_int[k] , sizeof( int )* frame_parms.N_RB_UL* 12* frame_parms.symbols_per_tti + 1 );
cudaMalloc(( void **)&ul_cu[i].d_ulsch_power[k] , sizeof( int ) );
}
dl_cu[i].CP = frame_parms.nb_prefix_samples;
dl_cu[i].CP0= frame_parms.nb_prefix_samples0;
dl_cu[i].ifftsize = frame_parms.ofdm_symbol_size;
dl_cu[i].Ncp = frame_parms.Ncp;
dl_cu[i].symbols_per_tti = frame_parms.symbols_per_tti;
dl_cu[i].samples_per_tti = frame_parms.samples_per_tti;
}
printf("[CUDA] CP0=%d, CP=%d, fftsize=%d, symbols_per_tti=%d, samples_per_tti=%d\n",ul_cu[i].CP0,ul_cu[i].CP,frame_parms.ofdm_symbol_size,frame_parms.symbols_per_tti,frame_parms.samples_per_tti);
init_cufft( );
}
void init_cufft( void )
{
//initial cufft plan fft128, fft256, fft512, fft1024, fft1536, fft2048
int i,j;
short fftsize = ul_cu[i].fftsize;
short Ncp = ul_cu[i].Ncp;
short symbols_per_tti = ul_cu[i].symbols_per_tti;
short samples_per_tti = ul_cu[i].samples_per_tti;
for ( i = 0; i < 10; i++ )
{
//for ul cuda
cudaStreamCreateWithFlags( &( ul_cu[i].stream_ul ), cudaStreamNonBlocking );
cudaStreamCreateWithFlags( &( ul_cu[i].tempstrm ), cudaStreamNonBlocking );
cufftPlan1d( &( ul_cu[i].fft ) , fftsize ,CUFFT_C2C, symbols_per_tti);
cufftSetStream( ul_cu[i].fft , ul_cu[i].stream_ul );
cudaStreamCreateWithFlags( &( ul_cu[i].timing_advance ), cudaStreamNonBlocking );
cufftPlan1d( &( ul_cu[i].ifft_timing_advance ) , fftsize ,CUFFT_C2C, symbols_per_tti);
cufftSetStream( ul_cu[i].ifft_timing_advance , ul_cu[i].timing_advance );
//for dl cuda
cudaStreamCreateWithFlags( &( dl_cu[i].stream_dl ), cudaStreamNonBlocking );
cufftPlan1d( &( dl_cu[i].ifft ) , fftsize ,CUFFT_C2C, symbols_per_tti);
cudaMalloc((void **)&(dl_cu[i].d_txdata) , sizeof( short )*(symbols_per_tti+1)* 2* symbols_per_tti*fftsize);
cudaMalloc((void **)&(dl_cu[i].d_txdata_o) , sizeof( short )* samples_per_tti* 2 );
cudaMalloc((void **)&(dl_cu[i].d_txdata_ifft), sizeof( float2 )* symbols_per_tti* fftsize);
cudaMallocHost((void **)&(dl_cu[i].h_txdata) , sizeof( short )* symbols_per_tti* 2* fftsize);
cufftSetStream( dl_cu[i].ifft , dl_cu[i].stream_dl );
}
for ( i = 0; i < 10; i++ )
{
cufftPlan1d( &( ul_cu[i].idft.fft12 ) , 12 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft24 ) , 24 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft36 ) , 36 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft48 ) , 48 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft60 ) , 60 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft72 ) , 72 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft84 ) , 84 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft96 ) , 96 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft108 ) , 108 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft120 ) , 120 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft132 ) , 132 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft144 ) , 144 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft156 ) , 156 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft168 ) , 168 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft180 ) , 180 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft192 ) , 192 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft204 ) , 204 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft216 ) , 216 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft228 ) , 228 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft240 ) , 240 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft252 ) , 252 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft264 ) , 264 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft276 ) , 276 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft288 ) , 288 ,CUFFT_C2C, 14 );
cufftPlan1d( &( ul_cu[i].idft.fft300 ) , 300 ,CUFFT_C2C, 14 );
cufftSetStream( ul_cu[i].idft.fft12 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft24 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft36 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft48 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft60 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft72 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft84 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft96 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft108 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft120 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft132 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft144 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft156 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft168 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft180 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft192 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft204 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft216 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft228 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft240 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft252 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft264 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft276 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft288 , ul_cu[i].stream_ul );
cufftSetStream( ul_cu[i].idft.fft300 , ul_cu[i].stream_ul );
}
}
void free_cufft(void)
{
int i, j;
for ( i = 0; i < 10; i++ )
{
//for ul cuda
cudaFree(ul_cu[i].d_rxdata);
cudaFree(ul_cu[i].d_rxdata_fft);
cufftDestroy(ul_cu[i].fft);
cudaStreamDestroy(ul_cu[i].stream_ul);
//for dl cuda
cudaFree(dl_cu[i].d_txdata);
cudaFree(dl_cu[i].d_txdata_o);
cudaFree(dl_cu[i].d_txdata_ifft);
cudaFreeHost(dl_cu[i].h_txdata);
cufftDestroy(dl_cu[i].ifft);
cudaStreamDestroy(dl_cu[i].stream_dl);
}
cudaDeviceReset();
printf("end cuda\n");
}
#include <stdint.h>
#include <stdio.h>
#include "assertions.h"
#ifdef __cplusplus
extern "C"
#endif
void ulsch_extract_rb_and_compensation_cu( unsigned int first_rb,
unsigned int nb_rb,
unsigned short first_carrier_offset,
unsigned short number_symbols,
unsigned short sf);
#ifdef __cplusplus
extern "C"
#endif
void ulsch_channel_compensation_cu( short sf, short cyclic_shift, int *out, int *out2, int *u, int *v, int Msc_RS, short const_shift);
#ifdef __cplusplus
extern "C"
#endif
void ulsch_extract_rb_cu( unsigned int first_rb,
unsigned int nb_rb,
unsigned short number_symbols,
unsigned short sf);
#ifdef __cplusplus
extern "C"
#endif
void exrb_compen_esti_cu( unsigned int first_rb,
unsigned int nb_rb,
unsigned short number_symbols,
unsigned short sf);
#ifdef __cplusplus
extern "C"
#endif
void estimation_cu( unsigned int first_rb,
unsigned int nb_rb,
unsigned short number_symbols,
unsigned short sf);
#ifdef __cplusplus
extern "C"
#endif
void compensation_cu( unsigned int first_rb,
unsigned int nb_rb,
unsigned short number_symbols,
short Qm,
unsigned short sf);
#ifdef __cplusplus
extern "C"
#endif
void idft_cu(unsigned int first_rb,
unsigned int nb_rb,
unsigned short number_symbols,
short cl,
unsigned short sf
);
#include "defs.h"
#include "PHY/CUDA/extern.h"
#ifndef CUFFT_H
#define CUFFT_H
#include "cufft.h"
#endif
#define ccmax(a,b) ((a>b) ? (a) : (b))
#define ccmin(a,b) ((a<b) ? (a) : (b))
__global__ void k_short_12( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_24( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_36( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_48( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_60( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_72( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_96( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_108( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_120( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_144( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_180( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_192( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_216( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_240( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_288( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void k_short_300( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb);
__global__ void exrb_compen_esti( float2 *x,
float2 *ul_ref1,
float2 *ul_ref2,
float2 *out,
int *sig_engery,
const unsigned int first_rb,
short cyclic_shift1,
short cyclic_shift2,
short Msc_RS)
{
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
const int tidy = threadIdx.y;
float2 rxdataF_ext;
float2 pilot_data1;
float2 pilot_data2;
float2 drs_ch;
__shared__ float power[600];
__shared__ int power1[600];
int cs,k, channel_level;
float phase, current_phase1, current_phase2;
float const_value = 22.627417;
float2 out_temp;
float cs_re[12] = { 1, 0.866025, 0.5, 0, -0.5, -0.866025, -1, -0.866025, -0.5, 0, 0.5, 0.866025};
float cs_im[12] = { 0, 0.5, 0.866025, 1, 0.866025, 0.5, 0, -0.5, -0.866025, -1, -0.866025, -0.5};
int mag;
int temp_re, temp_im;
short inv_ch[257] = {512,256,170,128,102,85,73,64,56,51,46,42,39,36,34,32,30,28,26,25,24,23,22,21,20,19,18,18,17,17,16,16,15,15,14,14,13,13,13,12,12,12,11,11,11,11,10,10,10,10,10,
9,9,9,9,9,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1};
int const_shift = 0;
int i;
unsigned int xcl;
unsigned char l2;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
rxdataF_ext = x[ symbol_id * 512 + pos ];
pilot_data1 = x[ 1536 + pos ];
pilot_data2 = x[ 5120 + pos ];
out_temp.x = (pilot_data1.x * ul_ref1[re_id].x + pilot_data1.y * ul_ref1[re_id].y)*0.0441941738;
out_temp.y = (pilot_data1.y * ul_ref1[re_id].x - pilot_data1.x * ul_ref1[re_id].y)*0.0441941738;
cs = ( re_id * cyclic_shift1 )%12;
pilot_data1.x = out_temp.x * cs_re[cs] + out_temp.y * cs_im[cs];
pilot_data1.y = out_temp.y * cs_re[cs] - out_temp.x * cs_im[cs];
out_temp.x = (pilot_data2.x * ul_ref2[re_id].x + pilot_data2.y * ul_ref2[re_id].y)*0.0441941738;
out_temp.y = (pilot_data2.y * ul_ref2[re_id].x - pilot_data2.x * ul_ref2[re_id].y)*0.0441941738;
cs = ( re_id * cyclic_shift2 )%12;
pilot_data2.x = out_temp.x * cs_re[cs] + out_temp.y * cs_im[cs];
pilot_data2.y = out_temp.y * cs_re[cs] - out_temp.x * cs_im[cs];
switch ( tidy )
{
case 0: power[re_id<<1] = pilot_data2.x * pilot_data1.x + pilot_data2.y * pilot_data1.y; break;
case 1: power[(re_id<<1)+1] = pilot_data2.y * pilot_data1.x - pilot_data2.x * pilot_data1.y; break;
}
__syncthreads();
for ( k = Msc_RS>>1; k > 0; k=k>>1 )
{
if ( re_id < k )
power[( re_id<<1)+ tidy] = power[(re_id<<1)+tidy] + power[((k+re_id)<<1)+tidy];
__syncthreads();
if ( k % 2 && re_id == 0 )
power[tidy] = power[tidy] + power[((k-1)<<1)+tidy];
__syncthreads();
}
phase = atanf( power[1]/power[0] );
if ( symbol_id != 10 && symbol_id != 3 )
{
current_phase1 = (phase/7)*(symbol_id- 3);
current_phase2 = (phase/7)*(symbol_id- 10);
drs_ch.x = ((pilot_data1.x * cosf(current_phase1) - pilot_data1.y * sinf(current_phase1)) +
(pilot_data2.x * cosf(current_phase2) - pilot_data2.y * sinf(current_phase2)))/2;
drs_ch.y = ((pilot_data1.y * cosf(current_phase1) + pilot_data1.x * sinf(current_phase1)) +
(pilot_data2.y * cosf(current_phase2) + pilot_data2.x * sinf(current_phase2)))/2;
switch(tidy)
{
case 0: power1[re_id<<1] = ((short)drs_ch.x * (short)drs_ch.x + (short)drs_ch.y * (short)drs_ch.y); break;
case 1: power1[(re_id<<1)+1] = ((short)drs_ch.x * (short)drs_ch.x + (short)drs_ch.y * (short)drs_ch.y)>>4; break;
}
__syncthreads();
for ( k = Msc_RS>>1; k > 0; k=k>>1 )
{
if ( re_id < k )
power1[(re_id<<1)+tidy] = power1[(re_id<<1)+tidy] + power1[((k+re_id)<<1)+tidy];
__syncthreads();
if ( k % 2 && re_id == 0 )
power1[tidy] = power1[tidy] + power1[((k-1)<<1)+tidy];
__syncthreads();
}
xcl = (unsigned int)(power1[0]/(Msc_RS<<1));
l2=0;
for (i=0; i<31; i++)
if ((xcl&(1<<i)) != 0)
l2 = i+1;
channel_level = (short)(l2>>1) + 4;
mag = ((int)(drs_ch.x * drs_ch.x + drs_ch.y * drs_ch.y))>>channel_level;
mag = ( mag >= 255 )? 255: mag;
switch ( tidy )
{
case 0:
out[symbol_id*Msc_RS+re_id].x = (float)((((int)(((rxdataF_ext.x * drs_ch.x) + ( rxdataF_ext.y * drs_ch.y ))*0.0441941738))>>channel_level)*inv_ch[mag]);
break;
case 1:
out[symbol_id*Msc_RS+re_id].y = (float)((((int)(((rxdataF_ext.y * drs_ch.x) - ( rxdataF_ext.x * drs_ch.y ))*0.0441941738))>>channel_level)*inv_ch[mag]);
break;
}
if(tidy == 0 && re_id == 0 && symbol_id == 0)
sig_engery[0] = (int)power1[1]*8/Msc_RS;
}
else if ( symbol_id == 3 )
{
out[symbol_id * 300 + re_id] = pilot_data1;
}
else if ( symbol_id == 10 )
{
out[symbol_id * 300 + re_id] = pilot_data2;
}
}
__global__ void exrb( float2 *x,
float2 *out,
short *out2,
const unsigned int first_rb,
const short Msc_RS
)
{
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
const int tidy = threadIdx.y;
float2 rxdataF_ext;
float2 pilot_data1;
float2 pilot_data2;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
rxdataF_ext.x = x[ symbol_id * 512 + pos ].x*0.0441941738;
rxdataF_ext.y = x[ symbol_id * 512 + pos ].y*0.0441941738;
pilot_data1.x = x[ 1536 + pos ].x*0.0441941738;
pilot_data1.y = x[ 1536 + pos ].y*0.0441941738;
pilot_data2.x = x[ 5120 + pos ].x*0.0441941738;
pilot_data2.y = x[ 5120 + pos ].y*0.0441941738;
out[symbol_id * 300 + re_id] = rxdataF_ext;
out2[((symbol_id*300+re_id)<<1)] = (short)rxdataF_ext.x;
out2[((symbol_id*300+re_id)<<1)+1] = (short)rxdataF_ext.y;
}
__global__ void estimation( float2 *x,
float2 *ul_ref1,
float2 *ul_ref2,
float2 *out,
short *out2,
short cyclic_shift1,
short cyclic_shift2,
short Msc_RS
)
{
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
const int tidy = threadIdx.y;
int cs,k, channel_level;
float phase, current_phase1, current_phase2;
float const_value = 22.627417;
float2 out_temp;
float cs_re[12] = { 1, 0.866025, 0.5, 0, -0.5, -0.866025, -1, -0.866025, -0.5, 0, 0.5, 0.866025};
float cs_im[12] = { 0, 0.5, 0.866025, 1, 0.866025, 0.5, 0, -0.5, -0.866025, -1, -0.866025, -0.5};
float2 drs_ch;
float2 pilot_data1;
float2 pilot_data2;
__shared__ float power[600];
pilot_data1 = x[900 + re_id];
pilot_data2 = x[3000+ re_id];
out_temp.x = pilot_data1.x * ul_ref1[re_id].x + pilot_data1.y * ul_ref1[re_id].y;
out_temp.y = pilot_data1.y * ul_ref1[re_id].x - pilot_data1.x * ul_ref1[re_id].y;
cs = ( re_id * cyclic_shift1 )%12;
pilot_data1.x = out_temp.x * cs_re[cs] + out_temp.y * cs_im[cs];
pilot_data1.y = out_temp.y * cs_re[cs] - out_temp.x * cs_im[cs];
out_temp.x = pilot_data2.x * ul_ref2[re_id].x + pilot_data2.y * ul_ref2[re_id].y;
out_temp.y = pilot_data2.y * ul_ref2[re_id].x - pilot_data2.x * ul_ref2[re_id].y;
cs = ( re_id * cyclic_shift2 )%12;
pilot_data2.x = out_temp.x * cs_re[cs] + out_temp.y * cs_im[cs];
pilot_data2.y = out_temp.y * cs_re[cs] - out_temp.x * cs_im[cs];
if ( tidy == 0 )
power[re_id<<1] = pilot_data2.x * pilot_data1.x + pilot_data2.y * pilot_data1.y;
else
power[(re_id<<1)+1] = pilot_data2.y * pilot_data1.x - pilot_data2.x * pilot_data1.y;
__syncthreads();
for ( k = Msc_RS>>1; k > 0; k=k>>1 )
{
if ( re_id < k )
power[( re_id<<1)+ tidy] = power[(re_id<<1)+tidy] + power[((k+re_id)<<1)+tidy];
__syncthreads();
if ( k % 2 && re_id == 0 )
power[tidy] = power[tidy] + power[((k-1)<<1)+tidy];
__syncthreads();
}
phase = atanf( power[1]/power[0] );
if ( symbol_id != 10 && symbol_id != 3 )
{
current_phase1 = (phase/7)*(symbol_id- 3);
current_phase2 = (phase/7)*(symbol_id- 10);
drs_ch.x = ((pilot_data1.x * cosf(current_phase1) - pilot_data1.y * sinf(current_phase1)) +
(pilot_data2.x * cosf(current_phase2) - pilot_data2.y * sinf(current_phase2)))/2;
drs_ch.y = ((pilot_data1.y * cosf(current_phase1) + pilot_data1.x * sinf(current_phase1)) +
(pilot_data2.y * cosf(current_phase2) + pilot_data2.x * sinf(current_phase2)))/2;
out[symbol_id*300+re_id] = drs_ch;
out2[((symbol_id*300+re_id)<<1)] = (short)drs_ch.x;
out2[((symbol_id*300+re_id)<<1)+1] = (short)drs_ch.y;
}
else if ( symbol_id == 3 )
{
out[symbol_id * 300 + re_id] = pilot_data1;
out2[((symbol_id*300+re_id)<<1)] = (short)pilot_data1.x;
out2[((symbol_id*300+re_id)<<1)+1] = (short)pilot_data1.y;
}
else if ( symbol_id == 10 )
{
out[symbol_id * 300 + re_id] = pilot_data2;
out2[((symbol_id*300+re_id)<<1)] = (short)pilot_data2.x;
out2[((symbol_id*300+re_id)<<1)+1] = (short)pilot_data2.y;
}
}
/*__global__ void add_value( int *x, int *y )
{
x[14*300] = y[0];
}*/
__global__ void compensation( float2 *x,
short *xt,
float2 *drs,
short *drst,
float Qm_const,
float2 *out,
short *out2,
int *sig_engery,
short Msc_RS
)
{
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
const int tidy = threadIdx.y;
int k, channel_level,mag;
float2 out_temp;
int temp_re, temp_im;
__shared__ int power[600];
/*short inv_ch[257] = {512,256,170,128,102,85,73,64,56,51,46,42,39,36,34,32,30,28,26,25,24,23,22,21,20,19,18,18,17,17,16,16,15,15,14,14,13,13,13,12,12,12,11,11,11,11,10,10,10,10,10,
9,9,9,9,9,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1};*/
float sig_eng,inv_ch;
float2 rxdataF_ext;
int const_shift = 0;
float2 drs_ch;
int i;
unsigned int xcl;
unsigned char l2;
short clr;
short cli;
/*rxdataF_ext.x = xt[(symbol_id * 300 + re_id)<<1];
rxdataF_ext.y = xt[((symbol_id * 300 + re_id)<<1)+1];
drs_ch.x = drst[(symbol_id * 300 + re_id)<<1];
drs_ch.y = drst[((symbol_id * 300 + re_id)<<1)+1];*/
clr = drst[(symbol_id * 300 + re_id)<<1];
cli = drst[((symbol_id * 300 + re_id)<<1)+1];
drs_ch = drs[symbol_id * 300 + re_id];
rxdataF_ext = x[symbol_id * 300 + re_id];
if ( symbol_id != 10 && symbol_id != 3 )
{
switch(tidy)
{
case 0: power[re_id<<1] = (clr * clr + cli * cli); break;
case 1: power[(re_id<<1)+1] = (clr * clr + cli * cli)>>4; break;
}
//power[re_id] = (clr * clr + cli * cli);
__syncthreads();
for ( k = Msc_RS>>1; k > 0; k=k>>1 )
{
if ( re_id < k )
power[(re_id<<1)+tidy] = power[(re_id<<1)+tidy] + power[((k+re_id)<<1)+tidy];
__syncthreads();
if ( k % 2 && re_id == 0 )
power[tidy] = power[tidy] + power[((k-1)<<1)+tidy];
__syncthreads();
}
//xcl = (unsigned int)(power[0]/(Msc_RS<<1));
//l2=0;
//for (i=0; i<31; i++)
//if ((xcl&(1<<i)) != 0)
//l2 = i+1;
//channel_level = (short)(l2>>1) + 4;
sig_eng = power[1]*8/Msc_RS;
mag = drs_ch.x * drs_ch.x + drs_ch.y * drs_ch.y;
inv_ch = 512/(sqrtf(mag)*Qm_const);
inv_ch = ( inv_ch > 512 )? 512:inv_ch;
inv_ch = ( inv_ch < 1 )? 1:inv_ch;
switch ( tidy )
{
case 0:
out[symbol_id*Msc_RS+re_id].x = (((rxdataF_ext.x * drs_ch.x) + ( rxdataF_ext.y * drs_ch.y ))*inv_ch)/sqrtf(mag);
out2[(symbol_id*300+re_id)<<1] = (short)out[symbol_id*Msc_RS+re_id].x;
break;
case 1:
out[symbol_id*Msc_RS+re_id].y = (((rxdataF_ext.y * drs_ch.x) - ( rxdataF_ext.x * drs_ch.y ))*inv_ch)/sqrtf(mag);
out2[((symbol_id*300+re_id)<<1)+1] = (short)out[symbol_id*Msc_RS+re_id].y;
break;
}
if(tidy == 0 && re_id == 0 && symbol_id == 0)
sig_engery[0] = (int)sig_eng;
}
}
void exrb_compen_esti_cu( unsigned int first_rb,
unsigned int nb_rb,
unsigned short number_symbols,
unsigned short sf)
{
dim3 block( number_symbols, 1, 1 );
dim3 thread( ulsch_para[sf].Msc_RS, 2, 1 );
//printf("[TEST]using RB = %d\n",nb_rb);
exrb_compen_esti<<< block, thread, 0, ul_cu[sf].stream_ul>>>
( ul_cu[sf].d_rxdata_fft[0],
esti_const.d_ul_ref_sigs_rx[ulsch_para[sf].u1][ulsch_para[sf].v1][ulsch_para[sf].Msc_RS_idx],
esti_const.d_ul_ref_sigs_rx[ulsch_para[sf].u2][ulsch_para[sf].v2][ulsch_para[sf].Msc_RS_idx],
ul_cu[sf].d_rxdata_comp[0],
ul_cu[sf].d_ulsch_power[0],
first_rb,
ulsch_para[sf].cyclic_shift1,
ulsch_para[sf].cyclic_shift2,
ulsch_para[sf].Msc_RS);
int aarx = 0;
switch ( ulsch_para[sf].Msc_RS )
{
case 12:
cufftExecC2C( ul_cu[sf].idft.fft12,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_12<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 24:
cufftExecC2C( ul_cu[sf].idft.fft24,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_24<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 36:
cufftExecC2C( ul_cu[sf].idft.fft36,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_36<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 48:
cufftExecC2C( ul_cu[sf].idft.fft48,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_48<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 60:
cufftExecC2C( ul_cu[sf].idft.fft60,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_60<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 72:
cufftExecC2C( ul_cu[sf].idft.fft72,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_72<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 96:
cufftExecC2C( ul_cu[sf].idft.fft96,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_96<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 108:
cufftExecC2C( ul_cu[sf].idft.fft108,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_108<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 120:
cufftExecC2C( ul_cu[sf].idft.fft120,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_120<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 144:
cufftExecC2C( ul_cu[sf].idft.fft144,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_144<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 180:
cufftExecC2C( ul_cu[sf].idft.fft180,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_180<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 192:
cufftExecC2C( ul_cu[sf].idft.fft192,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_192<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 216:
cufftExecC2C( ul_cu[sf].idft.fft216,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_216<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 240:
cufftExecC2C( ul_cu[sf].idft.fft240,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_240<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 288:
cufftExecC2C( ul_cu[sf].idft.fft288,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_288<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 300:
cufftExecC2C( ul_cu[sf].idft.fft300,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_300<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
}
}
void ulsch_extract_rb_cu( unsigned int first_rb,
unsigned int nb_rb,
unsigned short number_symbols,
unsigned short sf)
{
dim3 thread( nb_rb*12, 2, 1);
dim3 block( number_symbols, 1, 1);
exrb<<< block, thread, 0, ul_cu[sf].stream_ul>>>
( ul_cu[sf].d_rxdata_fft[0],
ul_cu[sf].d_rxdata_ext[0],
(short*)ul_cu[sf].d_rxdata_comp_int[0],
first_rb,
nb_rb*12
);
}
void estimation_cu( unsigned int first_rb,
unsigned int nb_rb,
unsigned short number_symbols,
unsigned short sf)
{
dim3 block( number_symbols, 1, 1 );
dim3 thread( ulsch_para[sf].Msc_RS, 2, 1 );
estimation<<< block, thread, 0, ul_cu[sf].stream_ul>>>
( ul_cu[sf].d_rxdata_ext[0],
esti_const.d_ul_ref_sigs_rx[ulsch_para[sf].u1][ulsch_para[sf].v1][ulsch_para[sf].Msc_RS_idx],
esti_const.d_ul_ref_sigs_rx[ulsch_para[sf].u2][ulsch_para[sf].v2][ulsch_para[sf].Msc_RS_idx],
ul_cu[sf].d_drs_ch[0],
(short*)ul_cu[sf].d_drs_ch_int[0],
ulsch_para[sf].cyclic_shift1,
ulsch_para[sf].cyclic_shift2,
ulsch_para[sf].Msc_RS
);
}
void compensation_cu( unsigned int first_rb,
unsigned int nb_rb,
unsigned short number_symbols,
short Qm,
unsigned short sf)
{
float Qm_const;
dim3 block( number_symbols, 1, 1 );
dim3 thread( ulsch_para[sf].Msc_RS, 2, 1 );
//printf("in compensation\n");
switch(Qm)
{
case 2: Qm_const = 1.0; break;
case 4: Qm_const = 0.632456; break;
}
compensation<<< block, thread, 0, ul_cu[sf].stream_ul>>>
( ul_cu[sf].d_rxdata_ext[0],
(short*)ul_cu[sf].d_rxdata_ext_int[0],
ul_cu[sf].d_drs_ch[0],
(short*)ul_cu[sf].d_drs_ch_int[0],
Qm_const,
ul_cu[sf].d_rxdata_comp[0],
(short*)ul_cu[sf].d_rxdata_comp_int[0],
ul_cu[sf].d_ulsch_power[0],
ulsch_para[sf].Msc_RS
);
}
void idft_cu(unsigned int first_rb,
unsigned int nb_rb,
unsigned short number_symbols,
short cl,
unsigned short sf
)
{
int aarx = 0;
switch ( ulsch_para[sf].Msc_RS )
{
case 12:
cufftExecC2C( ul_cu[sf].idft.fft12,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_12<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 24:
cufftExecC2C( ul_cu[sf].idft.fft24,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_24<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 36:
cufftExecC2C( ul_cu[sf].idft.fft36,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_36<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 48:
cufftExecC2C( ul_cu[sf].idft.fft48,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_48<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 60:
cufftExecC2C( ul_cu[sf].idft.fft60,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_60<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 72:
cufftExecC2C( ul_cu[sf].idft.fft72,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_72<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 96:
cufftExecC2C( ul_cu[sf].idft.fft96,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_96<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 108:
cufftExecC2C( ul_cu[sf].idft.fft108,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_108<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 120:
cufftExecC2C( ul_cu[sf].idft.fft120,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_120<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 144:
cufftExecC2C( ul_cu[sf].idft.fft144,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_144<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 180:
cufftExecC2C( ul_cu[sf].idft.fft180,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_180<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 192:
cufftExecC2C( ul_cu[sf].idft.fft192,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_192<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 216:
cufftExecC2C( ul_cu[sf].idft.fft216,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_216<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 240:
cufftExecC2C( ul_cu[sf].idft.fft240,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_240<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 288:
cufftExecC2C( ul_cu[sf].idft.fft288,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_288<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
case 300:
cufftExecC2C( ul_cu[sf].idft.fft300,
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
(cufftComplex *) ul_cu[sf].d_rxdata_comp[aarx],
CUFFT_INVERSE);
k_short_300<<< number_symbols, 300, 0, ul_cu[sf].stream_ul >>>( ul_cu[sf].d_rxdata_comp[aarx], (short*)ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0],ul_cu[sf].d_rxdata_fft[0],first_rb);
break;
}
//add_value<<< 1,1,0,ul_cu[sf].stream_ul >>>(ul_cu[sf].d_rxdata_comp_int[aarx],ul_cu[sf].d_ulsch_power[0]);
}
__global__ void k_short_12( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart12 = 0.28867513459;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 12 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart12);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart12);
}
}
__global__ void k_short_24( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart24 = 0.2041241452;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 24 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart24);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart24);
}
}
__global__ void k_short_36( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart36 = 0.16666666667;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 36 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart36);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart36);
}
}
__global__ void k_short_48( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart48 = 0.144337567297;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 48 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart48);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart48);
}
}
__global__ void k_short_60( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart60 = 0.12909944487;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 60 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart60);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart60);
}
}
__global__ void k_short_72( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart72 = 0.117851130198;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 72 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart72);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart72);
}
}
__global__ void k_short_96( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart96 = 0.102062072616;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 96 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart96);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart96);
}
}
__global__ void k_short_108( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart108 = 0.096225044865;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 108 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart108);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart108);
}
}
__global__ void k_short_120( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart120 = 0.0912870929175;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 120 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart120);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart120);
}
}
__global__ void k_short_144( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart144 = 0.083333333333333;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 144 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart144);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart144);
}
}
__global__ void k_short_180( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart180 = 0.07453559925;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 180 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart180);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart180);
}
}
__global__ void k_short_192( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart192 = 0.072168783649;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 192 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart192);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart192);
}
}
__global__ void k_short_216( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart216 = 0.068041381744;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 216 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart216);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart216);
}
}
__global__ void k_short_240( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart240 = 0.0645497224368;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 240 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart240);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart240);
}
}
__global__ void k_short_288( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart288 = 0.0589255651;
int outi= 300 * blockIdx.x+ threadIdx.x;
int ini = 288 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000+(re_id<<1)] = sig_eng[0];
y[6001+(re_id<<1)] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart288);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart288);
}
}
__global__ void k_short_300( float2 *x, short *y, int *sig_eng, float2 *rxF, unsigned int first_rb)
{
const float one_per_sqart300 = 0.057735026919;
const int outi= 300 * blockIdx.x + threadIdx.x;
const int ini = 300 * blockIdx.x + threadIdx.x;
const int symbol_id = blockIdx.x;
const int re_id = threadIdx.x;
int pos = (362 + first_rb * 12 + re_id)%512;
pos = ( pos >= 150 && pos < 362)? pos+212: pos;
switch ( symbol_id )
{
case 3:
y[1800+(re_id<<1)] = rxF[ 3584 + pos ].x*0.0441941738;
y[1801+(re_id<<1)] = rxF[ 3584 + pos ].y*0.0441941738;
break;
case 10:
if ( re_id == 0 )
{
y[6000] = sig_eng[0];
y[6001] = sig_eng[1];
}
break;
default:
y[outi<<1] = ( short )(x[ini].x*one_per_sqart300);
y[(outi<<1)+1]=( short )(x[ini].y*one_per_sqart300);
}
}
#ifndef __DEFS_CU__H__
#define __DEFS_CU__H__
#include <stdint.h>
#include <stdio.h>
#ifndef CUFFT_H
#define CUFFT_H
#include "cufft.h"
#endif
//typedef float2 Complex;
#ifdef __cplusplus
extern "C"
#endif
void idft512ad_cu( short *, short *, int );
#ifdef __cplusplus
extern "C"
#endif
void dft512rm_cu( short *, short *, int );
#endif
#include "stdio.h"
#include "cufft.h"
#include "defs.h"
#include "PHY/CUDA/extern.h"
typedef float2 Complex;
//for dftXXXrm
__global__ void k_rmcp(int16_t *x, Complex *y, int CP, int CP0)
{
int i= blockDim.x * blockIdx.x+ threadIdx.x ;
int j= (blockDim.x+CP )* blockIdx.x+ threadIdx.x + CP0;
if (blockIdx.x > 6)
j = j + CP0-CP;
y[i].x = ( float )x[(j<<1)];
y[i].y = ( float )x[(j<<1)+1];
}
//for dftXXXrm
__global__ void k_short(Complex *x, short *y)
{
int i= blockDim.x * blockIdx.x+ threadIdx.x;
y[i<<1] = ( short )(x[i].x*0.04419417);//for divide sqrt(512)
y[(i<<1)+1]=( short )(x[i].y*0.04419417);
}
__global__ void k_adcp_extend( short *x, Complex *y )
{
int i= blockDim.x * blockIdx.x+ threadIdx.x;
y[i].x = ( float )x[ (i<<1) ];
y[i].y = ( float )x[ (i<<1)+ 1 ];
}
__global__ void k_test( Complex *x )
{
int tid = threadIdx.x;
int bid = blockIdx.x;
x[bid*blockDim.x+tid].x= tid*22;
x[bid*blockDim.x+tid].y= bid*22;
}
void idft512ad_cu( int16_t *x, int16_t *y, int sf )
{//dl_cu
int i;
// printf("[CUDA] IN idft, sf num = %2d\n",sf);
cudaMemcpyAsync( dl_cu[sf].d_txdata,
x,
sizeof(short)* 2 * dl_cu[sf].ifftsize* dl_cu[sf].symbols_per_tti,
cudaMemcpyHostToDevice,
dl_cu[sf].stream_dl );
k_adcp_extend<<< dl_cu[sf].symbols_per_tti, dl_cu[sf].ifftsize, 0, dl_cu[sf].stream_dl >>>
( dl_cu[sf].d_txdata,
dl_cu[sf].d_txdata_ifft );
cufftExecC2C( dl_cu[sf].ifft,
(cufftComplex *) dl_cu[sf].d_txdata_ifft,
(cufftComplex *) dl_cu[sf].d_txdata_ifft,
CUFFT_INVERSE);
k_short<<< dl_cu[sf].symbols_per_tti, dl_cu[sf].ifftsize, 0, dl_cu[sf].stream_dl >>>
( dl_cu[sf].d_txdata_ifft,
dl_cu[sf].d_txdata );
cudaMemcpyAsync( dl_cu[sf].h_txdata,
dl_cu[sf].d_txdata,
sizeof( short )* 2 * dl_cu[sf].ifftsize* dl_cu[sf].symbols_per_tti,
cudaMemcpyDeviceToHost,
dl_cu[sf].stream_dl);
int index = 0;
short *temp = dl_cu[sf].h_txdata;
cudaStreamSynchronize( dl_cu[sf].stream_dl );
for ( i = 0; i < dl_cu[sf].symbols_per_tti; i++ )
{
int cp = 0;
if( i == 0 || i == 7 )
cp = dl_cu[sf].CP0;
else
cp = dl_cu[sf].CP;
memcpy( &y[ index<<1 ], &temp[ (i+1)*dl_cu[sf].ifftsize*2-cp*2 ], cp*sizeof(short)*2 );
memcpy( &y[ (index+cp)<<1 ], &temp[ i*dl_cu[sf].ifftsize*2 ], dl_cu[sf].ifftsize*2*sizeof(short) );
index = index + cp + dl_cu[sf].ifftsize;
}
}
void dft512rm_cu( int16_t *x, int16_t *y, int sf )
{
//printf("enter DFT\n");
cudaEvent_t startEvent, stopEvent;
cudaEventCreate(&startEvent);
cudaEventCreate(&stopEvent);
cudaEventRecord(startEvent, 0);
cudaMemcpyAsync(ul_cu[sf].d_rxdata[0],
x,
sizeof(short)* ul_cu[sf].samples_per_tti*2,
cudaMemcpyHostToDevice,
ul_cu[sf].stream_ul );
k_rmcp<<< ul_cu[sf].symbols_per_tti,
ul_cu[sf].fftsize,
0,
ul_cu[sf].stream_ul>>>
((short*)ul_cu[sf].d_rxdata[0],
ul_cu[sf].d_rxdata_fft[0],
36,
40);
cufftExecC2C(ul_cu[sf].fft,
(cufftComplex *)ul_cu[sf].d_rxdata_fft[0],
(cufftComplex *)ul_cu[sf].d_rxdata_fft[0],
CUFFT_FORWARD);
k_short<<< ul_cu[sf].symbols_per_tti,
ul_cu[sf].fftsize,
0,
ul_cu[sf].stream_ul>>>
( ul_cu[sf].d_rxdata_fft[0],
(short *)ul_cu[sf].d_rxdataF[0]);
cudaStreamSynchronize( ul_cu[sf].stream_ul);
cudaMemcpyAsync(y,
ul_cu[sf].d_rxdataF[0],
sizeof(short)* ul_cu[sf].symbols_per_tti* 2* ul_cu[sf].fftsize,
cudaMemcpyDeviceToHost,
ul_cu[sf].stream_ul );
cudaStreamSynchronize( ul_cu[sf].stream_ul);
float time;
cudaEventRecord(stopEvent, 0);
cudaEventSynchronize(stopEvent);
cudaEventElapsedTime(&time, startEvent, stopEvent);
//printf("[GPU] end of DFT %f\n",time);
cudaEventDestroy(startEvent);
cudaEventDestroy(stopEvent);
}
#include "PHY/defs.h"
#include "PHY/extern.h"
#include "PHY/CUDA/LTE_TRANSPORT/defs.h"
void rx_ulsch_cu(PHY_VARS_eNB *phy_vars_eNB,
uint32_t sched_subframe,
uint8_t eNB_id, // this is the effective sector id
uint8_t UE_id,
LTE_eNB_ULSCH_t **ulsch,
uint8_t cooperation_flag);
\ No newline at end of file
#include <stdint.h>
#include <stdio.h>
#ifndef CUFFT_H
#define CUFFT_H
#include "cufft.h"
#endif
typedef struct {
float2 *d_ul_ref_sigs_rx[30][2][33];
} estimation_const_t;
typedef struct {
short u1;
short v1;
short u2;
short v2;
short Msc_RS_idx;
short cyclic_shift1;
short cyclic_shift2;
short Msc_RS;
} para_ulsch;
typedef struct {
unsigned int first_rb;
unsigned short first_carrier_offset;
short N_RB_UL;
unsigned short nb_rb1;
unsigned short nb_rb2;
short fftsize;
} ext_rbs;
typedef struct {
cufftHandle fft12;
cufftHandle fft24;
cufftHandle fft36;
cufftHandle fft48;
cufftHandle fft60;
cufftHandle fft72;
cufftHandle fft84;
cufftHandle fft96;
cufftHandle fft108;
cufftHandle fft120;
cufftHandle fft132;
cufftHandle fft144;
cufftHandle fft156;
cufftHandle fft168;
cufftHandle fft180;
cufftHandle fft192;
cufftHandle fft204;
cufftHandle fft216;
cufftHandle fft228;
cufftHandle fft240;
cufftHandle fft252;
cufftHandle fft264;
cufftHandle fft276;
cufftHandle fft288;
cufftHandle fft300;
} fftHandle;
typedef struct {
cudaStream_t stream_ul;
cudaStream_t timing_advance;
cudaStream_t tempstrm;
cufftHandle fft;
cufftHandle ifft_timing_advance;
fftHandle idft;
int **d_rxdata;
float2 **d_rxdata_fft;
int **d_rxdataF;
int **d_rxdata_comp_int;
float2 **d_rxdata_comp;
float2 **d_drs_ch;
int **d_drs_ch_int;
int **d_ulsch_power;
float2 **d_rxdata_ext;
int **d_rxdata_ext_int;
short N_RB_UL;
short nb_antennas_rx;
short symbols_per_tti;
short samples_per_tti;
short Ncp;
short fftsize;
short CP;
short CP0;
} ul_cu_t;
typedef struct {
cudaStream_t stream_dl;
cufftHandle ifft;
short *d_txdata;
short *d_txdata_o;
float2 *d_txdata_ifft;
short *h_txdata;
short symbols_per_tti;
short samples_per_tti;
short Ncp;
short ifftsize;
short CP;
short CP0;
} dl_cu_t;
\ No newline at end of file
#ifndef __MODULATION_EXTERN_CU_H__
#define __MODULATION_EXTERN_CU_H_
#include "defs_struct.h"
#include <stdint.h>
#include <stdio.h>
#ifndef CUFFT_H
#define CUFFT_H
#include "cufft.h"
#endif
extern dl_cu_t dl_cu[10];
extern ul_cu_t ul_cu[10];
extern estimation_const_t esti_const;
extern int device_count;
extern para_ulsch ulsch_para[10];
extern ext_rbs ext_rbs_para[10];
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment