Commit d243b72e authored by Luis Ariza's avatar Luis Ariza

Starting Ziggurat version SSE2 coding

parent cd07e9eb
...@@ -1155,31 +1155,7 @@ int phy_init_lte_ue(PHY_VARS_UE *ue, ...@@ -1155,31 +1155,7 @@ int phy_init_lte_ue(PHY_VARS_UE *ue,
for (i=0; i<fp->nb_antennas_rx; i++) { for (i=0; i<fp->nb_antennas_rx; i++) {
common_vars->rxdata[i] = (int32_t*) malloc16_clear( (fp->samples_per_tti*10+2048)*sizeof(int32_t) ); common_vars->rxdata[i] = (int32_t*) malloc16_clear( (fp->samples_per_tti*10+2048)*sizeof(int32_t) );
} }
/*if (do_ofdm_mod)
{
common_vars->common_vars_rx_data_per_thread[0].rxdataF = (int32_t**)malloc16( fp->nb_antennas_rx*sizeof(int32_t*));
printf("[lte_init_f] address of rxdataF in memory: %p, thread %d\n",&common_vars->common_vars_rx_data_per_thread[0].rxdataF,0);
for (i=0; i<fp->nb_antennas_rx; i++) {
common_vars->common_vars_rx_data_per_thread[0].rxdataF[i] = (int32_t*)malloc16_clear((10*fp->ofdm_symbol_size*fp->symbols_per_tti)*sizeof(int32_t));
printf("[lte_init_f] address of rxdataF[i] in memory: %p, thread %d, antenna %d\n",&common_vars->common_vars_rx_data_per_thread[0].rxdataF[i],0,i);
}
//rxdata_temp. Please remove this dummy allocation when all arrays are size fixed. If not, the multichannel does not work.
rxdataF_temp = (int32_t**)malloc16( fp->nb_antennas_rx*sizeof(int32_t*) );
for (i=0; i<fp->nb_antennas_rx; i++) {
rxdataF_temp[i] = (int32_t*)malloc16_clear((14313)*sizeof(int32_t));
}
common_vars->common_vars_rx_data_per_thread[1].rxdataF = (int32_t**)malloc16( fp->nb_antennas_rx*sizeof(int32_t*) );
printf("[lte_init_f] address of rxdataF in memory: %p, thread %d\n",&common_vars->common_vars_rx_data_per_thread[1].rxdataF,1);
for (i=0; i<fp->nb_antennas_rx; i++) {
common_vars->common_vars_rx_data_per_thread[1].rxdataF[i] = (int32_t*)malloc16_clear((10*fp->ofdm_symbol_size*fp->symbols_per_tti)*sizeof(int32_t));
printf("[lte_init_f address of rxdataF[i] in memory: %p, thread %d, antenna %d\n",&common_vars->common_vars_rx_data_per_thread[1].rxdataF[i],1,i);
}
}
else
//{*/
for (th_id=0; th_id<RX_NB_TH_MAX; th_id++){ for (th_id=0; th_id<RX_NB_TH_MAX; th_id++){
common_vars->common_vars_rx_data_per_thread[th_id].rxdataF = (int32_t**)malloc16( fp->nb_antennas_rx*sizeof(int32_t*) ); common_vars->common_vars_rx_data_per_thread[th_id].rxdataF = (int32_t**)malloc16( fp->nb_antennas_rx*sizeof(int32_t*) );
printf("[lte_init_t] address of rxdataF in memory: %p, thread %d\n",&common_vars->common_vars_rx_data_per_thread[th_id].rxdataF,th_id); printf("[lte_init_t] address of rxdataF in memory: %p, thread %d\n",&common_vars->common_vars_rx_data_per_thread[th_id].rxdataF,th_id);
...@@ -1208,19 +1184,6 @@ int phy_init_lte_ue(PHY_VARS_UE *ue, ...@@ -1208,19 +1184,6 @@ int phy_init_lte_ue(PHY_VARS_UE *ue,
} }
} }
} }
/*for (i=0; i<fp->ofdm_symbol_size*fp->symbols_per_tti; i++) {
for (int aa=0; aa<fp->nb_antennas_rx; aa++) {
((short *)common_vars->common_vars_rx_data_per_thread[1].rxdataF[aa])[((i+fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)] = (short)(i);
((short *)common_vars->common_vars_rx_data_per_thread[1].rxdataF[aa])[1+((i+fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)] = (short)(1);
if (i < 300) {
printf("[lte-init] rxdataF (thread[%d]) %d: (%d,%d), memory addr %p\n",1,i,((short *)common_vars->common_vars_rx_data_per_thread[1].rxdataF[aa])[((i+fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)],((short *)common_vars->common_vars_rx_data_per_thread[1].rxdataF[aa])[1+((i+fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)],&common_vars->common_vars_rx_data_per_thread[1].rxdataF[aa][1+((i+fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)]);
printf("[lte-init] rxdataF (thread[%d]) %d: (%d,%d), memory addr %p\n",0,i,((short *)common_vars->common_vars_rx_data_per_thread[0].rxdataF[aa])[((i+4+2*fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)],((short *)common_vars->common_vars_rx_data_per_thread[0].rxdataF[aa])[1+((i+4+2*fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)],&common_vars->common_vars_rx_data_per_thread[0].rxdataF[aa][1+((i+4+2*fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)]);
}
}
}*/
} }
// DLSCH // DLSCH
......
...@@ -17,6 +17,12 @@ ...@@ -17,6 +17,12 @@
*------------------------------------------------------------------------------- *-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance: * For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org * contact@openairinterface.org
*-------------------------------------------------------------------------------
* Optimization using SIMD instructions
* Frecuency Domain Analysis
* Luis Felipe Ariza Vesga, email:lfarizav@unal.edu.co
* Functions: adc_SSE_float(), adc_prach(), adc_prach_SSE_float().
*-------------------------------------------------------------------------------
*/ */
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
...@@ -154,59 +160,7 @@ void adc_freq(double *r_re[2], ...@@ -154,59 +160,7 @@ void adc_freq(double *r_re[2],
//printf("Adc outputs %d %e %d \n",i,((short *)output[0])[((i+output_offset)<<1)], ((i+output_offset)<<1) ); //printf("Adc outputs %d %e %d \n",i,((short *)output[0])[((i+output_offset)<<1)], ((i+output_offset)<<1) );
} }
} }
/*void adc_freq(double *r_re[2],
double *r_im[2],
unsigned int input_offset,
unsigned int output_offset,
unsigned int **output1,//thread th_id
unsigned int **output2,//thread 0
unsigned int **output3,//thread 1
unsigned int nb_rx_antennas,
unsigned int length,
unsigned char B,
int thread)
{
int i;
//int th_id;
int aa;
double gain = (double)(1<<(B-1));*/
/*int dummy_rx[nb_rx_antennas][length] __attribute__((aligned(32)));
for (aa=0; aa<nb_rx_antennas; aa++) {
memset (&output1[aa][output_offset],0,length*sizeof(int));
}*/
//double gain = 1.0;
/*for (i=0; i<length; i++) {
for (aa=0; aa<nb_rx_antennas; aa++) {
((short *)output1[aa])[((i+output_offset)<<1)] = (short)(r_re[aa][i+input_offset]*gain);
((short *)output1[aa])[1+((i+output_offset)<<1)] = (short)(r_im[aa][i+input_offset]*gain);
if ((r_re[aa][i+input_offset]*gain) > 30000) {
//("Adc outputs %d %e %d \n",i,((short *)output[0])[((i+output_offset)<<1)], ((i+output_offset)<<1) );
}
if (i < 300) {
printf("rxdataF (thread[%d]) %d: (%d,%d)\n",thread,i,((short *)output1[aa])[((i+output_offset)<<1)],((short *)output1[aa])[1+((i+output_offset)<<1)]);
if (thread==0 && output_offset>length)
printf("rxdataF (thread[1]) %d: (%d,%d) \n",i,((short *)output3[aa])[((i+output_offset-length-4)<<1)],((short *)output3[aa])[1+((i+output_offset-length-4)<<1)]);
else if (thread==1)
printf("rxdataF (thread[0]) %d: (%d,%d) \n",i,((short *)output2[aa])[((i+output_offset+length+4)<<1)],((short *)output2[aa])[1+((i+output_offset+length+4)<<1)]);
}
}*/
/*for (aa=0; aa<nb_rx_antennas; aa++) {
for (th_id=1; th_id<2; th_id++)
{
memcpy((void *)output[aa][output_offset],
(void *)output[aa][output_offset],
length*sizeof(int));
}
}*/
/*}
printf("thread %d\n",(unsigned int)thread);
//write_output("adc_rxsigF_frame0.m","adc_rxsF0", output1[0],10*length,1,16);
//write_output("adc_rxsigF_frame1.m","adc_rxsF1", output2[0],10*length,1,16);
}*/
void adc_prach(double *r_re[2], void adc_prach(double *r_re[2],
double *r_im[2], double *r_im[2],
unsigned int input_offset, unsigned int input_offset,
...@@ -251,23 +205,82 @@ void adc_prach_SSE_float(float *r_re[2], ...@@ -251,23 +205,82 @@ void adc_prach_SSE_float(float *r_re[2],
int i; int i;
int aa; int aa;
double gain = (double)(1<<(B-1)); __m128 r_re128,r_im128,gain128;
__m128i r_re128i, r_im128i,output128;
float gain = (double)(1<<(B-1));
gain128=_mm_set1_ps(gain);
//double gain = 1.0; //double gain = 1.0;
for (i=0; i<length; i++) { for (i=0; i<(length>>2); i++) {
for (aa=0; aa<nb_rx_antennas; aa++) { for (aa=0; aa<nb_rx_antennas; aa++) {
((short *)output[aa])[((i+output_offset/2)<<1)] = (short)(r_re[aa][i+input_offset]*gain); //((short *)output[aa])[((i+output_offset/2)<<1)] = (short)(r_re[aa][i+input_offset]*gain);
((short *)output[aa])[1+((i+output_offset/2)<<1)] = (short)(r_im[aa][i+input_offset]*gain); //((short *)output[aa])[1+((i+output_offset/2)<<1)] = (short)(r_im[aa][i+input_offset]*gain);
#ifdef DEBUG_ADC r_re128=_mm_loadu_ps(&r_re[aa][4*i+input_offset]);
if (i<10) r_im128=_mm_loadu_ps(&r_im[aa][4*i+input_offset]);
printf("[adc_prach]i %d. input (%d,%d), output (%d,%d)\n",i,(short)(r_re[aa][i+input_offset]),(short)(r_im[aa][i+input_offset]),((short *)output[aa])[((i+output_offset/2)<<1)],((short *)output[aa])[1+((i+output_offset/2)<<1)]); r_re128=_mm_mul_ps(r_re128,gain128);
if (i>length-10&&i<length) r_im128=_mm_mul_ps(r_im128,gain128);
#endif r_re128i=_mm_cvtps_epi32(r_re128);
if ((r_re[aa][i+input_offset]*gain) > 30000) { r_im128i=_mm_cvtps_epi32(r_im128);
//("Adc outputs %d %e %d \n",i,((short *)output[0])[((i+output_offset)<<1)], ((i+output_offset)<<1) ); r_re128i=_mm_packs_epi32(r_re128i,r_re128i);
} r_im128i=_mm_packs_epi32(r_im128i,r_im128i);
output128=_mm_unpacklo_epi16(r_re128i,r_im128i);
_mm_storeu_si128((__m128i *)&output[aa][4*i+output_offset/2],output128);
} }
//printf("Adc outputs %d %e %d \n",i,((short *)output[0])[((i+output_offset)<<1)], ((i+output_offset)<<1) ); //printf("Adc outputs %d %e %d \n",i,((short *)output[0])[((i+output_offset)<<1)], ((i+output_offset)<<1) );
} }
} }
/*void adc_freq(double *r_re[2],
double *r_im[2],
unsigned int input_offset,
unsigned int output_offset,
unsigned int **output1,//thread th_id
unsigned int **output2,//thread 0
unsigned int **output3,//thread 1
unsigned int nb_rx_antennas,
unsigned int length,
unsigned char B,
int thread)
{
int i;
//int th_id;
int aa;
double gain = (double)(1<<(B-1));*/
/*int dummy_rx[nb_rx_antennas][length] __attribute__((aligned(32)));
for (aa=0; aa<nb_rx_antennas; aa++) {
memset (&output1[aa][output_offset],0,length*sizeof(int));
}*/
//double gain = 1.0;
/*for (i=0; i<length; i++) {
for (aa=0; aa<nb_rx_antennas; aa++) {
((short *)output1[aa])[((i+output_offset)<<1)] = (short)(r_re[aa][i+input_offset]*gain);
((short *)output1[aa])[1+((i+output_offset)<<1)] = (short)(r_im[aa][i+input_offset]*gain);
if ((r_re[aa][i+input_offset]*gain) > 30000) {
//("Adc outputs %d %e %d \n",i,((short *)output[0])[((i+output_offset)<<1)], ((i+output_offset)<<1) );
}
if (i < 300) {
printf("rxdataF (thread[%d]) %d: (%d,%d)\n",thread,i,((short *)output1[aa])[((i+output_offset)<<1)],((short *)output1[aa])[1+((i+output_offset)<<1)]);
if (thread==0 && output_offset>length)
printf("rxdataF (thread[1]) %d: (%d,%d) \n",i,((short *)output3[aa])[((i+output_offset-length-4)<<1)],((short *)output3[aa])[1+((i+output_offset-length-4)<<1)]);
else if (thread==1)
printf("rxdataF (thread[0]) %d: (%d,%d) \n",i,((short *)output2[aa])[((i+output_offset+length+4)<<1)],((short *)output2[aa])[1+((i+output_offset+length+4)<<1)]);
}
}*/
/*for (aa=0; aa<nb_rx_antennas; aa++) {
for (th_id=1; th_id<2; th_id++)
{
memcpy((void *)output[aa][output_offset],
(void *)output[aa][output_offset],
length*sizeof(int));
}
}*/
/*}
printf("thread %d\n",(unsigned int)thread);
//write_output("adc_rxsigF_frame0.m","adc_rxsF0", output1[0],10*length,1,16);
//write_output("adc_rxsigF_frame1.m","adc_rxsF1", output2[0],10*length,1,16);
}*/
...@@ -17,6 +17,12 @@ ...@@ -17,6 +17,12 @@
*------------------------------------------------------------------------------- *-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance: * For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org * contact@openairinterface.org
*-------------------------------------------------------------------------------
* Optimization using SIMD instructions
* Frecuency Domain Analysis
* Luis Felipe Ariza Vesga, email:lfarizav@unal.edu.co
* Functions: dac_fixed_gain_SSE_float(), dac_fixed_gain_prach(), dac_fixed_gain_prach_SSE_float().
*-------------------------------------------------------------------------------
*/ */
//#define DEBUG_DAC //#define DEBUG_DAC
...@@ -297,8 +303,8 @@ float dac_fixed_gain_prach_SSE_float(float *s_re[2], ...@@ -297,8 +303,8 @@ float dac_fixed_gain_prach_SSE_float(float *s_re[2],
int i; int i;
int aa; int aa;
float amp,amp1; float amp,amp1,div;
int k; __m128 input_re128, input_im128;
amp = //sqrt(NB_RE)*pow(10.0,.05*txpwr_dBm)/sqrt(nb_tx_antennas); //this is amp per tx antenna amp = //sqrt(NB_RE)*pow(10.0,.05*txpwr_dBm)/sqrt(nb_tx_antennas); //this is amp per tx antenna
pow(10.0,.05*txpwr_dBm)/sqrt(nb_tx_antennas); //this is amp per tx antenna pow(10.0,.05*txpwr_dBm)/sqrt(nb_tx_antennas); //this is amp per tx antenna
...@@ -322,27 +328,26 @@ float dac_fixed_gain_prach_SSE_float(float *s_re[2], ...@@ -322,27 +328,26 @@ float dac_fixed_gain_prach_SSE_float(float *s_re[2],
amp1 = amp1*sqrt(512.0/300.0); //account for loss due to null carriers amp1 = amp1*sqrt(512.0/300.0); //account for loss due to null carriers
//printf("DL: amp1 %f dB (%d,%d), tx_power %f\n",20*log10(amp1),input_offset,input_offset_meas,txpwr_dBm); //printf("DL: amp1 %f dB (%d,%d), tx_power %f\n",20*log10(amp1),input_offset,input_offset_meas,txpwr_dBm);
*/ */
#ifdef DEBUG_DAC
printf("DAC: input_offset %d, amp %e, amp1 %e\n",input_offset,amp,amp1); div=amp/amp1;
#endif for (i=0; i<(length>>2); i++) {
k=input_offset;
for (i=0; i<length*2; i+=2) {
for (aa=0; aa<nb_tx_antennas; aa++) { for (aa=0; aa<nb_tx_antennas; aa++) {
s_re[aa][i/2] = amp*((float)(((short *)input))[((k))])/amp1; ///(1<<(B-1)); //s_re[aa][i] = div*((float)(((short *)input))[((input_offset+2*i))]); ///(1<<(B-1));
s_im[aa][i/2] = amp*((float)(((short *)input))[((k))+1])/amp1; ///(1<<(B-1)); //s_im[aa][i] = div*((float)(((short *)input))[((input_offset+2*i))+1]); ///(1<<(B-1));
#ifdef DEBUG_DAC
if(i<20) input_re128=_mm_set_ps((float)(((short *)input))[2*(4*i+3)+input_offset],(float)(((short *)input))[2*(4*i+2)+input_offset],(float)(((short *)input))[2*(4*i+1)+input_offset],(float)(((short *)input))[2*(4*i)+input_offset]);
printf("DAC[%d]: input (%d,%d). output (%e,%e)\n",i/2,(((short *)input))[((k))],(((short *)input))[((k))+1],s_re[aa][i/2],s_im[aa][i/2]); input_im128=_mm_set_ps((float)(((short *)input))[2*(4*i+3)+1+input_offset],(float)(((short *)input))[2*(4*i+2)+1+input_offset],(float)(((short *)input))[2*(4*i+1)+1+input_offset],(float)(((short *)input))[2*(4*i)+1+input_offset]);
if (i>length*2-20&&i<length*2) input_re128=_mm_mul_ps(input_re128,_mm_set1_ps(div));
printf("DAC[%d]: input (%d,%d). output (%e,%e)\n",i/2,(((short *)input))[((k))],(((short *)input))[((k))+1],s_re[aa][i/2],s_im[aa][i/2]); input_im128=_mm_mul_ps(input_im128,_mm_set1_ps(div));
#endif _mm_storeu_ps(&s_re[aa][4*i],input_re128);
k+=2; _mm_storeu_ps(&s_im[aa][4*i],input_im128);
if (k==12*2*ofdm_symbol_size)
k=0; if (2*i+input_offset==12*2*ofdm_symbol_size)
i=0;
} }
} }
// printf("ener %e\n",signal_energy_fp(s_re,s_im,nb_tx_antennas,length,0)); // printf("ener %e\n",signal_energy_fp(s_re,s_im,nb_tx_antennas,length,0));
return(signal_energy_fp(s_re,s_im,nb_tx_antennas,length_meas,0)/NB_RE); return(signal_energy_fp_SSE_float(s_re,s_im,nb_tx_antennas,length_meas,0)/NB_RE);
} }
...@@ -17,6 +17,12 @@ ...@@ -17,6 +17,12 @@
*------------------------------------------------------------------------------- *-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance: * For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org * contact@openairinterface.org
*-------------------------------------------------------------------------------
* Optimization using SIMD instructions
* Frecuency Domain Analysis
* Luis Felipe Ariza Vesga, email:lfarizav@unal.edu.co
* Functions: rf_rx_simple_freq(), rf_rx_simple_freq_SSE_float().
*-------------------------------------------------------------------------------
*/ */
//#include "defs.h" //#include "defs.h"
...@@ -294,17 +300,15 @@ clock_t start=clock();*/ ...@@ -294,17 +300,15 @@ clock_t start=clock();*/
} }
else else
{ {
//rx128_gain_lin=mm_mul_set1_ps(rx_gain_lin);
rx128_re = _mm_loadu_pd(&r_re[a][2*i]);//r_re[a][i],r_re[a][i+1] rx128_re = _mm_loadu_pd(&r_re[a][2*i]);//r_re[a][i],r_re[a][i+1]
rx128_im = _mm_loadu_pd(&r_im[a][2*i]);//r_im[a][i],r_im[a][i+1] rx128_im = _mm_loadu_pd(&r_im[a][2*i]);//r_im[a][i],r_im[a][i+1]
rx128_gain_lin = _mm_set1_pd(rx_gain_lin); rx128_gain_lin = _mm_set1_pd(rx_gain_lin);
gauss_0_128_sqrt_NOW = _mm_set_pd(ziggurat(0.0,1.0),ziggurat(0.0,1.0)); //gauss_0_128_sqrt_NOW = _mm_set_pd(ziggurat(0.0,1.0),ziggurat(0.0,1.0));
gauss_1_128_sqrt_NOW = _mm_set_pd(ziggurat(0.0,1.0),ziggurat(0.0,1.0)); //gauss_1_128_sqrt_NOW = _mm_set_pd(ziggurat(0.0,1.0),ziggurat(0.0,1.0));
boxmuller_SSE_float(&gauss_0_128_sqrt_NOW, &gauss_1_128_sqrt_NOW);
gauss_0_128_sqrt_NOW = _mm_mul_pd(gauss_0_128_sqrt_NOW,_mm_set1_pd(sqrt_NOW)); gauss_0_128_sqrt_NOW = _mm_mul_pd(gauss_0_128_sqrt_NOW,_mm_set1_pd(sqrt_NOW));
gauss_1_128_sqrt_NOW = _mm_mul_pd(gauss_1_128_sqrt_NOW,_mm_set1_pd(sqrt_NOW)); gauss_1_128_sqrt_NOW = _mm_mul_pd(gauss_1_128_sqrt_NOW,_mm_set1_pd(sqrt_NOW));
// Amplify by receiver gain and apply 3rd order non-linearity // Amplify by receiver gain and apply 3rd order non-linearity
//r_re[a][i] = rx_gain_lin*(r_re[a][i] + sqrt(.5*N0W)*gaussdouble(0.0,1.0));
//r_im[a][i] = rx_gain_lin*(r_im[a][i] + sqrt(.5*N0W)*gaussdouble(0.0,1.0));
rx128_re = _mm_add_pd(rx128_re,gauss_0_128_sqrt_NOW); rx128_re = _mm_add_pd(rx128_re,gauss_0_128_sqrt_NOW);
rx128_im = _mm_add_pd(rx128_im,gauss_1_128_sqrt_NOW); rx128_im = _mm_add_pd(rx128_im,gauss_1_128_sqrt_NOW);
rx128_re = _mm_mul_pd(rx128_re,rx128_gain_lin); rx128_re = _mm_mul_pd(rx128_re,rx128_gain_lin);
...@@ -419,7 +423,6 @@ clock_t start=clock();*/ ...@@ -419,7 +423,6 @@ clock_t start=clock();*/
} }
else else
{*/ {*/
//rx128_gain_lin=mm_mul_set1_ps(rx_gain_lin);
rx128_re = _mm_loadu_ps(&r_re[a][4*i]);//r_re[a][i],r_re[a][i+1] rx128_re = _mm_loadu_ps(&r_re[a][4*i]);//r_re[a][i],r_re[a][i+1]
rx128_im = _mm_loadu_ps(&r_im[a][4*i]);//r_im[a][i],r_im[a][i+1] rx128_im = _mm_loadu_ps(&r_im[a][4*i]);//r_im[a][i],r_im[a][i+1]
//start_meas(&desc->ziggurat); //start_meas(&desc->ziggurat);
...@@ -427,17 +430,9 @@ clock_t start=clock();*/ ...@@ -427,17 +430,9 @@ clock_t start=clock();*/
//gauss_1_128_sqrt_NOW = _mm_set_ps(ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0)); //gauss_1_128_sqrt_NOW = _mm_set_ps(ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0));
boxmuller_SSE_float(&gauss_0_128_sqrt_NOW, &gauss_1_128_sqrt_NOW); boxmuller_SSE_float(&gauss_0_128_sqrt_NOW, &gauss_1_128_sqrt_NOW);
//stop_meas(&desc->ziggurat); //stop_meas(&desc->ziggurat);
//_mm_storeu_ps(out,gauss_0_128_sqrt_NOW);
//_mm_storeu_ps(out1,gauss_1_128_sqrt_NOW);
//printf("Ziggurat is %e,%e,%e,%e\n",ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0));
//printf("boxmuller is %e,%e,%e,%e\n",gaussdouble(0.0,1.0),gaussdouble(0.0,1.0),gaussdouble(0.0,1.0),gaussdouble(0.0,1.0));
//printf("NOR SSE is %e,%e,%e,%e\n",out[0],out[1],out[2],out[3]);
//printf("NOR SSE1 is %e,%e,%e,%e\n",out1[0],out1[1],out1[2],out1[3]);
gauss_0_128_sqrt_NOW = _mm_mul_ps(gauss_0_128_sqrt_NOW,_mm_set1_ps(sqrt_NOW)); gauss_0_128_sqrt_NOW = _mm_mul_ps(gauss_0_128_sqrt_NOW,_mm_set1_ps(sqrt_NOW));
gauss_1_128_sqrt_NOW = _mm_mul_ps(gauss_1_128_sqrt_NOW,_mm_set1_ps(sqrt_NOW)); gauss_1_128_sqrt_NOW = _mm_mul_ps(gauss_1_128_sqrt_NOW,_mm_set1_ps(sqrt_NOW));
// Amplify by receiver gain and apply 3rd order non-linearity // Amplify by receiver gain and apply 3rd order non-linearity
//r_re[a][i] = rx_gain_lin*(r_re[a][i] + sqrt(.5*N0W)*gaussdouble(0.0,1.0));
//r_im[a][i] = rx_gain_lin*(r_im[a][i] + sqrt(.5*N0W)*gaussdouble(0.0,1.0));
rx128_re = _mm_add_ps(_mm_mul_ps(rx128_re,rx128_gain_lin),gauss_0_128_sqrt_NOW); rx128_re = _mm_add_ps(_mm_mul_ps(rx128_re,rx128_gain_lin),gauss_0_128_sqrt_NOW);
rx128_im = _mm_add_ps(_mm_mul_ps(rx128_im,rx128_gain_lin),gauss_1_128_sqrt_NOW); rx128_im = _mm_add_ps(_mm_mul_ps(rx128_im,rx128_gain_lin),gauss_1_128_sqrt_NOW);
_mm_storeu_ps(&r_re[a][4*i],rx128_re); _mm_storeu_ps(&r_re[a][4*i],rx128_re);
...@@ -445,6 +440,29 @@ clock_t start=clock();*/ ...@@ -445,6 +440,29 @@ clock_t start=clock();*/
//} //}
} }
} }
/*rx128_re = _mm_loadu_ps(&r_re[a][4*i+ofdm_symbol_size*j]);//r_re[a][i],r_re[a][i+1]
rx128_im = _mm_loadu_ps(&r_im[a][4*i+ofdm_symbol_size*j]);//r_im[a][i],r_im[a][i+1]
rx128_re_1 = _mm_loadu_ps(&r_re[a][(ofdm_symbol_size-n_samples)+4*i+ofdm_symbol_size*j]);//r_re[a][i],r_re[a][i+1]
rx128_im_1 = _mm_loadu_ps(&r_im[a][(ofdm_symbol_size-n_samples)+4*i+ofdm_symbol_size*j]);//r_im[a][i],r_im[a][i+1]
//start_meas(&desc->ziggurat);
//gauss_0_128_sqrt_NOW = _mm_set_ps(ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0));
//gauss_1_128_sqrt_NOW = _mm_set_ps(ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0));
boxmuller_SSE_float(&gauss_0_128_sqrt_NOW, &gauss_1_128_sqrt_NOW);
boxmuller_SSE_float(&gauss_0_128_sqrt_NOW_1, &gauss_1_128_sqrt_NOW_1);
//stop_meas(&desc->ziggurat);
gauss_0_128_sqrt_NOW = _mm_mul_ps(gauss_0_128_sqrt_NOW,_mm_set1_ps(sqrt_NOW));
gauss_1_128_sqrt_NOW = _mm_mul_ps(gauss_1_128_sqrt_NOW,_mm_set1_ps(sqrt_NOW));
gauss_0_128_sqrt_NOW_1 = _mm_mul_ps(gauss_0_128_sqrt_NOW_1,_mm_set1_ps(sqrt_NOW));
gauss_1_128_sqrt_NOW_1 = _mm_mul_ps(gauss_1_128_sqrt_NOW_1,_mm_set1_ps(sqrt_NOW));
// Amplify by receiver gain and apply 3rd order non-linearity
rx128_re = _mm_add_ps(_mm_mul_ps(rx128_re,rx128_gain_lin),gauss_0_128_sqrt_NOW);
rx128_im = _mm_add_ps(_mm_mul_ps(rx128_im,rx128_gain_lin),gauss_1_128_sqrt_NOW);
rx128_re_1 = _mm_add_ps(_mm_mul_ps(rx128_re_1,rx128_gain_lin),gauss_0_128_sqrt_NOW_1);
rx128_im_1 = _mm_add_ps(_mm_mul_ps(rx128_im_1,rx128_gain_lin),gauss_1_128_sqrt_NOW_1);
_mm_storeu_ps(&r_re[a][4*i+ofdm_symbol_size*j],rx128_re);
_mm_storeu_ps(&r_im[a][4*i+ofdm_symbol_size*j],rx128_im);
_mm_storeu_ps(&r_re[a][(ofdm_symbol_size-n_samples)+4*i+ofdm_symbol_size*j],rx128_re_1);
_mm_storeu_ps(&r_im[a][(ofdm_symbol_size-n_samples)+4*i+ofdm_symbol_size*j],rx128_im_1);*/
/*clock_t stop=clock(); /*clock_t stop=clock();
printf("do_DL_sig time is %f s, AVERAGE time is %f s, count %d, sum %e\n",(float) (stop-start)/CLOCKS_PER_SEC,(float) (sum+stop-start)/(count*CLOCKS_PER_SEC),count,sum+stop-start); printf("do_DL_sig time is %f s, AVERAGE time is %f s, count %d, sum %e\n",(float) (stop-start)/CLOCKS_PER_SEC,(float) (sum+stop-start)/(count*CLOCKS_PER_SEC),count,sum+stop-start);
sum=(sum+stop-start);*/ sum=(sum+stop-start);*/
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -17,6 +17,13 @@ ...@@ -17,6 +17,13 @@
*------------------------------------------------------------------------------- *-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance: * For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org * contact@openairinterface.org
*-------------------------------------------------------------------------------
* Optimization using SIMD instructions
* Frecuency Domain Analysis
* Luis Felipe Ariza Vesga, email:lfarizav@unal.edu.co
* Functions: do_DL_sig_freq(), do_UL_sig_freq(), init_channel_vars_freq(),
* do_UL_sig_freq_prach.
*-------------------------------------------------------------------------------
*/ */
#include <string.h> #include <string.h>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment