Commit d243b72e authored by Luis Ariza's avatar Luis Ariza

Starting Ziggurat version SSE2 coding

parent cd07e9eb
......@@ -1155,31 +1155,7 @@ int phy_init_lte_ue(PHY_VARS_UE *ue,
for (i=0; i<fp->nb_antennas_rx; i++) {
common_vars->rxdata[i] = (int32_t*) malloc16_clear( (fp->samples_per_tti*10+2048)*sizeof(int32_t) );
}
/*if (do_ofdm_mod)
{
common_vars->common_vars_rx_data_per_thread[0].rxdataF = (int32_t**)malloc16( fp->nb_antennas_rx*sizeof(int32_t*));
printf("[lte_init_f] address of rxdataF in memory: %p, thread %d\n",&common_vars->common_vars_rx_data_per_thread[0].rxdataF,0);
for (i=0; i<fp->nb_antennas_rx; i++) {
common_vars->common_vars_rx_data_per_thread[0].rxdataF[i] = (int32_t*)malloc16_clear((10*fp->ofdm_symbol_size*fp->symbols_per_tti)*sizeof(int32_t));
printf("[lte_init_f] address of rxdataF[i] in memory: %p, thread %d, antenna %d\n",&common_vars->common_vars_rx_data_per_thread[0].rxdataF[i],0,i);
}
//rxdata_temp. Please remove this dummy allocation when all arrays are size fixed. If not, the multichannel does not work.
rxdataF_temp = (int32_t**)malloc16( fp->nb_antennas_rx*sizeof(int32_t*) );
for (i=0; i<fp->nb_antennas_rx; i++) {
rxdataF_temp[i] = (int32_t*)malloc16_clear((14313)*sizeof(int32_t));
}
common_vars->common_vars_rx_data_per_thread[1].rxdataF = (int32_t**)malloc16( fp->nb_antennas_rx*sizeof(int32_t*) );
printf("[lte_init_f] address of rxdataF in memory: %p, thread %d\n",&common_vars->common_vars_rx_data_per_thread[1].rxdataF,1);
for (i=0; i<fp->nb_antennas_rx; i++) {
common_vars->common_vars_rx_data_per_thread[1].rxdataF[i] = (int32_t*)malloc16_clear((10*fp->ofdm_symbol_size*fp->symbols_per_tti)*sizeof(int32_t));
printf("[lte_init_f address of rxdataF[i] in memory: %p, thread %d, antenna %d\n",&common_vars->common_vars_rx_data_per_thread[1].rxdataF[i],1,i);
}
}
else
//{*/
for (th_id=0; th_id<RX_NB_TH_MAX; th_id++){
common_vars->common_vars_rx_data_per_thread[th_id].rxdataF = (int32_t**)malloc16( fp->nb_antennas_rx*sizeof(int32_t*) );
printf("[lte_init_t] address of rxdataF in memory: %p, thread %d\n",&common_vars->common_vars_rx_data_per_thread[th_id].rxdataF,th_id);
......@@ -1208,19 +1184,6 @@ int phy_init_lte_ue(PHY_VARS_UE *ue,
}
}
}
/*for (i=0; i<fp->ofdm_symbol_size*fp->symbols_per_tti; i++) {
for (int aa=0; aa<fp->nb_antennas_rx; aa++) {
((short *)common_vars->common_vars_rx_data_per_thread[1].rxdataF[aa])[((i+fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)] = (short)(i);
((short *)common_vars->common_vars_rx_data_per_thread[1].rxdataF[aa])[1+((i+fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)] = (short)(1);
if (i < 300) {
printf("[lte-init] rxdataF (thread[%d]) %d: (%d,%d), memory addr %p\n",1,i,((short *)common_vars->common_vars_rx_data_per_thread[1].rxdataF[aa])[((i+fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)],((short *)common_vars->common_vars_rx_data_per_thread[1].rxdataF[aa])[1+((i+fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)],&common_vars->common_vars_rx_data_per_thread[1].rxdataF[aa][1+((i+fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)]);
printf("[lte-init] rxdataF (thread[%d]) %d: (%d,%d), memory addr %p\n",0,i,((short *)common_vars->common_vars_rx_data_per_thread[0].rxdataF[aa])[((i+4+2*fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)],((short *)common_vars->common_vars_rx_data_per_thread[0].rxdataF[aa])[1+((i+4+2*fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)],&common_vars->common_vars_rx_data_per_thread[0].rxdataF[aa][1+((i+4+2*fp->ofdm_symbol_size*fp->symbols_per_tti)<<1)]);
}
}
}*/
}
// DLSCH
......
......@@ -17,6 +17,12 @@
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*-------------------------------------------------------------------------------
* Optimization using SIMD instructions
* Frecuency Domain Analysis
* Luis Felipe Ariza Vesga, email:lfarizav@unal.edu.co
* Functions: adc_SSE_float(), adc_prach(), adc_prach_SSE_float().
*-------------------------------------------------------------------------------
*/
#include <string.h>
#include <stdio.h>
......@@ -154,59 +160,7 @@ void adc_freq(double *r_re[2],
//printf("Adc outputs %d %e %d \n",i,((short *)output[0])[((i+output_offset)<<1)], ((i+output_offset)<<1) );
}
}
/*void adc_freq(double *r_re[2],
double *r_im[2],
unsigned int input_offset,
unsigned int output_offset,
unsigned int **output1,//thread th_id
unsigned int **output2,//thread 0
unsigned int **output3,//thread 1
unsigned int nb_rx_antennas,
unsigned int length,
unsigned char B,
int thread)
{
int i;
//int th_id;
int aa;
double gain = (double)(1<<(B-1));*/
/*int dummy_rx[nb_rx_antennas][length] __attribute__((aligned(32)));
for (aa=0; aa<nb_rx_antennas; aa++) {
memset (&output1[aa][output_offset],0,length*sizeof(int));
}*/
//double gain = 1.0;
/*for (i=0; i<length; i++) {
for (aa=0; aa<nb_rx_antennas; aa++) {
((short *)output1[aa])[((i+output_offset)<<1)] = (short)(r_re[aa][i+input_offset]*gain);
((short *)output1[aa])[1+((i+output_offset)<<1)] = (short)(r_im[aa][i+input_offset]*gain);
if ((r_re[aa][i+input_offset]*gain) > 30000) {
//("Adc outputs %d %e %d \n",i,((short *)output[0])[((i+output_offset)<<1)], ((i+output_offset)<<1) );
}
if (i < 300) {
printf("rxdataF (thread[%d]) %d: (%d,%d)\n",thread,i,((short *)output1[aa])[((i+output_offset)<<1)],((short *)output1[aa])[1+((i+output_offset)<<1)]);
if (thread==0 && output_offset>length)
printf("rxdataF (thread[1]) %d: (%d,%d) \n",i,((short *)output3[aa])[((i+output_offset-length-4)<<1)],((short *)output3[aa])[1+((i+output_offset-length-4)<<1)]);
else if (thread==1)
printf("rxdataF (thread[0]) %d: (%d,%d) \n",i,((short *)output2[aa])[((i+output_offset+length+4)<<1)],((short *)output2[aa])[1+((i+output_offset+length+4)<<1)]);
}
}*/
/*for (aa=0; aa<nb_rx_antennas; aa++) {
for (th_id=1; th_id<2; th_id++)
{
memcpy((void *)output[aa][output_offset],
(void *)output[aa][output_offset],
length*sizeof(int));
}
}*/
/*}
printf("thread %d\n",(unsigned int)thread);
//write_output("adc_rxsigF_frame0.m","adc_rxsF0", output1[0],10*length,1,16);
//write_output("adc_rxsigF_frame1.m","adc_rxsF1", output2[0],10*length,1,16);
}*/
void adc_prach(double *r_re[2],
double *r_im[2],
unsigned int input_offset,
......@@ -251,23 +205,82 @@ void adc_prach_SSE_float(float *r_re[2],
int i;
int aa;
double gain = (double)(1<<(B-1));
__m128 r_re128,r_im128,gain128;
__m128i r_re128i, r_im128i,output128;
float gain = (double)(1<<(B-1));
gain128=_mm_set1_ps(gain);
//double gain = 1.0;
for (i=0; i<length; i++) {
for (i=0; i<(length>>2); i++) {
for (aa=0; aa<nb_rx_antennas; aa++) {
((short *)output[aa])[((i+output_offset/2)<<1)] = (short)(r_re[aa][i+input_offset]*gain);
((short *)output[aa])[1+((i+output_offset/2)<<1)] = (short)(r_im[aa][i+input_offset]*gain);
#ifdef DEBUG_ADC
if (i<10)
printf("[adc_prach]i %d. input (%d,%d), output (%d,%d)\n",i,(short)(r_re[aa][i+input_offset]),(short)(r_im[aa][i+input_offset]),((short *)output[aa])[((i+output_offset/2)<<1)],((short *)output[aa])[1+((i+output_offset/2)<<1)]);
if (i>length-10&&i<length)
#endif
if ((r_re[aa][i+input_offset]*gain) > 30000) {
//("Adc outputs %d %e %d \n",i,((short *)output[0])[((i+output_offset)<<1)], ((i+output_offset)<<1) );
}
//((short *)output[aa])[((i+output_offset/2)<<1)] = (short)(r_re[aa][i+input_offset]*gain);
//((short *)output[aa])[1+((i+output_offset/2)<<1)] = (short)(r_im[aa][i+input_offset]*gain);
r_re128=_mm_loadu_ps(&r_re[aa][4*i+input_offset]);
r_im128=_mm_loadu_ps(&r_im[aa][4*i+input_offset]);
r_re128=_mm_mul_ps(r_re128,gain128);
r_im128=_mm_mul_ps(r_im128,gain128);
r_re128i=_mm_cvtps_epi32(r_re128);
r_im128i=_mm_cvtps_epi32(r_im128);
r_re128i=_mm_packs_epi32(r_re128i,r_re128i);
r_im128i=_mm_packs_epi32(r_im128i,r_im128i);
output128=_mm_unpacklo_epi16(r_re128i,r_im128i);
_mm_storeu_si128((__m128i *)&output[aa][4*i+output_offset/2],output128);
}
//printf("Adc outputs %d %e %d \n",i,((short *)output[0])[((i+output_offset)<<1)], ((i+output_offset)<<1) );
}
}
/*void adc_freq(double *r_re[2],
double *r_im[2],
unsigned int input_offset,
unsigned int output_offset,
unsigned int **output1,//thread th_id
unsigned int **output2,//thread 0
unsigned int **output3,//thread 1
unsigned int nb_rx_antennas,
unsigned int length,
unsigned char B,
int thread)
{
int i;
//int th_id;
int aa;
double gain = (double)(1<<(B-1));*/
/*int dummy_rx[nb_rx_antennas][length] __attribute__((aligned(32)));
for (aa=0; aa<nb_rx_antennas; aa++) {
memset (&output1[aa][output_offset],0,length*sizeof(int));
}*/
//double gain = 1.0;
/*for (i=0; i<length; i++) {
for (aa=0; aa<nb_rx_antennas; aa++) {
((short *)output1[aa])[((i+output_offset)<<1)] = (short)(r_re[aa][i+input_offset]*gain);
((short *)output1[aa])[1+((i+output_offset)<<1)] = (short)(r_im[aa][i+input_offset]*gain);
if ((r_re[aa][i+input_offset]*gain) > 30000) {
//("Adc outputs %d %e %d \n",i,((short *)output[0])[((i+output_offset)<<1)], ((i+output_offset)<<1) );
}
if (i < 300) {
printf("rxdataF (thread[%d]) %d: (%d,%d)\n",thread,i,((short *)output1[aa])[((i+output_offset)<<1)],((short *)output1[aa])[1+((i+output_offset)<<1)]);
if (thread==0 && output_offset>length)
printf("rxdataF (thread[1]) %d: (%d,%d) \n",i,((short *)output3[aa])[((i+output_offset-length-4)<<1)],((short *)output3[aa])[1+((i+output_offset-length-4)<<1)]);
else if (thread==1)
printf("rxdataF (thread[0]) %d: (%d,%d) \n",i,((short *)output2[aa])[((i+output_offset+length+4)<<1)],((short *)output2[aa])[1+((i+output_offset+length+4)<<1)]);
}
}*/
/*for (aa=0; aa<nb_rx_antennas; aa++) {
for (th_id=1; th_id<2; th_id++)
{
memcpy((void *)output[aa][output_offset],
(void *)output[aa][output_offset],
length*sizeof(int));
}
}*/
/*}
printf("thread %d\n",(unsigned int)thread);
//write_output("adc_rxsigF_frame0.m","adc_rxsF0", output1[0],10*length,1,16);
//write_output("adc_rxsigF_frame1.m","adc_rxsF1", output2[0],10*length,1,16);
}*/
......@@ -17,6 +17,12 @@
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*-------------------------------------------------------------------------------
* Optimization using SIMD instructions
* Frecuency Domain Analysis
* Luis Felipe Ariza Vesga, email:lfarizav@unal.edu.co
* Functions: dac_fixed_gain_SSE_float(), dac_fixed_gain_prach(), dac_fixed_gain_prach_SSE_float().
*-------------------------------------------------------------------------------
*/
//#define DEBUG_DAC
......@@ -297,8 +303,8 @@ float dac_fixed_gain_prach_SSE_float(float *s_re[2],
int i;
int aa;
float amp,amp1;
int k;
float amp,amp1,div;
__m128 input_re128, input_im128;
amp = //sqrt(NB_RE)*pow(10.0,.05*txpwr_dBm)/sqrt(nb_tx_antennas); //this is amp per tx antenna
pow(10.0,.05*txpwr_dBm)/sqrt(nb_tx_antennas); //this is amp per tx antenna
......@@ -322,27 +328,26 @@ float dac_fixed_gain_prach_SSE_float(float *s_re[2],
amp1 = amp1*sqrt(512.0/300.0); //account for loss due to null carriers
//printf("DL: amp1 %f dB (%d,%d), tx_power %f\n",20*log10(amp1),input_offset,input_offset_meas,txpwr_dBm);
*/
#ifdef DEBUG_DAC
printf("DAC: input_offset %d, amp %e, amp1 %e\n",input_offset,amp,amp1);
#endif
k=input_offset;
for (i=0; i<length*2; i+=2) {
div=amp/amp1;
for (i=0; i<(length>>2); i++) {
for (aa=0; aa<nb_tx_antennas; aa++) {
s_re[aa][i/2] = amp*((float)(((short *)input))[((k))])/amp1; ///(1<<(B-1));
s_im[aa][i/2] = amp*((float)(((short *)input))[((k))+1])/amp1; ///(1<<(B-1));
#ifdef DEBUG_DAC
if(i<20)
printf("DAC[%d]: input (%d,%d). output (%e,%e)\n",i/2,(((short *)input))[((k))],(((short *)input))[((k))+1],s_re[aa][i/2],s_im[aa][i/2]);
if (i>length*2-20&&i<length*2)
printf("DAC[%d]: input (%d,%d). output (%e,%e)\n",i/2,(((short *)input))[((k))],(((short *)input))[((k))+1],s_re[aa][i/2],s_im[aa][i/2]);
#endif
k+=2;
if (k==12*2*ofdm_symbol_size)
k=0;
//s_re[aa][i] = div*((float)(((short *)input))[((input_offset+2*i))]); ///(1<<(B-1));
//s_im[aa][i] = div*((float)(((short *)input))[((input_offset+2*i))+1]); ///(1<<(B-1));
input_re128=_mm_set_ps((float)(((short *)input))[2*(4*i+3)+input_offset],(float)(((short *)input))[2*(4*i+2)+input_offset],(float)(((short *)input))[2*(4*i+1)+input_offset],(float)(((short *)input))[2*(4*i)+input_offset]);
input_im128=_mm_set_ps((float)(((short *)input))[2*(4*i+3)+1+input_offset],(float)(((short *)input))[2*(4*i+2)+1+input_offset],(float)(((short *)input))[2*(4*i+1)+1+input_offset],(float)(((short *)input))[2*(4*i)+1+input_offset]);
input_re128=_mm_mul_ps(input_re128,_mm_set1_ps(div));
input_im128=_mm_mul_ps(input_im128,_mm_set1_ps(div));
_mm_storeu_ps(&s_re[aa][4*i],input_re128);
_mm_storeu_ps(&s_im[aa][4*i],input_im128);
if (2*i+input_offset==12*2*ofdm_symbol_size)
i=0;
}
}
// printf("ener %e\n",signal_energy_fp(s_re,s_im,nb_tx_antennas,length,0));
return(signal_energy_fp(s_re,s_im,nb_tx_antennas,length_meas,0)/NB_RE);
return(signal_energy_fp_SSE_float(s_re,s_im,nb_tx_antennas,length_meas,0)/NB_RE);
}
......@@ -17,6 +17,12 @@
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*-------------------------------------------------------------------------------
* Optimization using SIMD instructions
* Frecuency Domain Analysis
* Luis Felipe Ariza Vesga, email:lfarizav@unal.edu.co
* Functions: rf_rx_simple_freq(), rf_rx_simple_freq_SSE_float().
*-------------------------------------------------------------------------------
*/
//#include "defs.h"
......@@ -294,17 +300,15 @@ clock_t start=clock();*/
}
else
{
//rx128_gain_lin=mm_mul_set1_ps(rx_gain_lin);
rx128_re = _mm_loadu_pd(&r_re[a][2*i]);//r_re[a][i],r_re[a][i+1]
rx128_im = _mm_loadu_pd(&r_im[a][2*i]);//r_im[a][i],r_im[a][i+1]
rx128_gain_lin = _mm_set1_pd(rx_gain_lin);
gauss_0_128_sqrt_NOW = _mm_set_pd(ziggurat(0.0,1.0),ziggurat(0.0,1.0));
gauss_1_128_sqrt_NOW = _mm_set_pd(ziggurat(0.0,1.0),ziggurat(0.0,1.0));
//gauss_0_128_sqrt_NOW = _mm_set_pd(ziggurat(0.0,1.0),ziggurat(0.0,1.0));
//gauss_1_128_sqrt_NOW = _mm_set_pd(ziggurat(0.0,1.0),ziggurat(0.0,1.0));
boxmuller_SSE_float(&gauss_0_128_sqrt_NOW, &gauss_1_128_sqrt_NOW);
gauss_0_128_sqrt_NOW = _mm_mul_pd(gauss_0_128_sqrt_NOW,_mm_set1_pd(sqrt_NOW));
gauss_1_128_sqrt_NOW = _mm_mul_pd(gauss_1_128_sqrt_NOW,_mm_set1_pd(sqrt_NOW));
// Amplify by receiver gain and apply 3rd order non-linearity
//r_re[a][i] = rx_gain_lin*(r_re[a][i] + sqrt(.5*N0W)*gaussdouble(0.0,1.0));
//r_im[a][i] = rx_gain_lin*(r_im[a][i] + sqrt(.5*N0W)*gaussdouble(0.0,1.0));
rx128_re = _mm_add_pd(rx128_re,gauss_0_128_sqrt_NOW);
rx128_im = _mm_add_pd(rx128_im,gauss_1_128_sqrt_NOW);
rx128_re = _mm_mul_pd(rx128_re,rx128_gain_lin);
......@@ -419,7 +423,6 @@ clock_t start=clock();*/
}
else
{*/
//rx128_gain_lin=mm_mul_set1_ps(rx_gain_lin);
rx128_re = _mm_loadu_ps(&r_re[a][4*i]);//r_re[a][i],r_re[a][i+1]
rx128_im = _mm_loadu_ps(&r_im[a][4*i]);//r_im[a][i],r_im[a][i+1]
//start_meas(&desc->ziggurat);
......@@ -427,17 +430,9 @@ clock_t start=clock();*/
//gauss_1_128_sqrt_NOW = _mm_set_ps(ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0));
boxmuller_SSE_float(&gauss_0_128_sqrt_NOW, &gauss_1_128_sqrt_NOW);
//stop_meas(&desc->ziggurat);
//_mm_storeu_ps(out,gauss_0_128_sqrt_NOW);
//_mm_storeu_ps(out1,gauss_1_128_sqrt_NOW);
//printf("Ziggurat is %e,%e,%e,%e\n",ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0));
//printf("boxmuller is %e,%e,%e,%e\n",gaussdouble(0.0,1.0),gaussdouble(0.0,1.0),gaussdouble(0.0,1.0),gaussdouble(0.0,1.0));
//printf("NOR SSE is %e,%e,%e,%e\n",out[0],out[1],out[2],out[3]);
//printf("NOR SSE1 is %e,%e,%e,%e\n",out1[0],out1[1],out1[2],out1[3]);
gauss_0_128_sqrt_NOW = _mm_mul_ps(gauss_0_128_sqrt_NOW,_mm_set1_ps(sqrt_NOW));
gauss_1_128_sqrt_NOW = _mm_mul_ps(gauss_1_128_sqrt_NOW,_mm_set1_ps(sqrt_NOW));
// Amplify by receiver gain and apply 3rd order non-linearity
//r_re[a][i] = rx_gain_lin*(r_re[a][i] + sqrt(.5*N0W)*gaussdouble(0.0,1.0));
//r_im[a][i] = rx_gain_lin*(r_im[a][i] + sqrt(.5*N0W)*gaussdouble(0.0,1.0));
rx128_re = _mm_add_ps(_mm_mul_ps(rx128_re,rx128_gain_lin),gauss_0_128_sqrt_NOW);
rx128_im = _mm_add_ps(_mm_mul_ps(rx128_im,rx128_gain_lin),gauss_1_128_sqrt_NOW);
_mm_storeu_ps(&r_re[a][4*i],rx128_re);
......@@ -445,6 +440,29 @@ clock_t start=clock();*/
//}
}
}
/*rx128_re = _mm_loadu_ps(&r_re[a][4*i+ofdm_symbol_size*j]);//r_re[a][i],r_re[a][i+1]
rx128_im = _mm_loadu_ps(&r_im[a][4*i+ofdm_symbol_size*j]);//r_im[a][i],r_im[a][i+1]
rx128_re_1 = _mm_loadu_ps(&r_re[a][(ofdm_symbol_size-n_samples)+4*i+ofdm_symbol_size*j]);//r_re[a][i],r_re[a][i+1]
rx128_im_1 = _mm_loadu_ps(&r_im[a][(ofdm_symbol_size-n_samples)+4*i+ofdm_symbol_size*j]);//r_im[a][i],r_im[a][i+1]
//start_meas(&desc->ziggurat);
//gauss_0_128_sqrt_NOW = _mm_set_ps(ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0));
//gauss_1_128_sqrt_NOW = _mm_set_ps(ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0));
boxmuller_SSE_float(&gauss_0_128_sqrt_NOW, &gauss_1_128_sqrt_NOW);
boxmuller_SSE_float(&gauss_0_128_sqrt_NOW_1, &gauss_1_128_sqrt_NOW_1);
//stop_meas(&desc->ziggurat);
gauss_0_128_sqrt_NOW = _mm_mul_ps(gauss_0_128_sqrt_NOW,_mm_set1_ps(sqrt_NOW));
gauss_1_128_sqrt_NOW = _mm_mul_ps(gauss_1_128_sqrt_NOW,_mm_set1_ps(sqrt_NOW));
gauss_0_128_sqrt_NOW_1 = _mm_mul_ps(gauss_0_128_sqrt_NOW_1,_mm_set1_ps(sqrt_NOW));
gauss_1_128_sqrt_NOW_1 = _mm_mul_ps(gauss_1_128_sqrt_NOW_1,_mm_set1_ps(sqrt_NOW));
// Amplify by receiver gain and apply 3rd order non-linearity
rx128_re = _mm_add_ps(_mm_mul_ps(rx128_re,rx128_gain_lin),gauss_0_128_sqrt_NOW);
rx128_im = _mm_add_ps(_mm_mul_ps(rx128_im,rx128_gain_lin),gauss_1_128_sqrt_NOW);
rx128_re_1 = _mm_add_ps(_mm_mul_ps(rx128_re_1,rx128_gain_lin),gauss_0_128_sqrt_NOW_1);
rx128_im_1 = _mm_add_ps(_mm_mul_ps(rx128_im_1,rx128_gain_lin),gauss_1_128_sqrt_NOW_1);
_mm_storeu_ps(&r_re[a][4*i+ofdm_symbol_size*j],rx128_re);
_mm_storeu_ps(&r_im[a][4*i+ofdm_symbol_size*j],rx128_im);
_mm_storeu_ps(&r_re[a][(ofdm_symbol_size-n_samples)+4*i+ofdm_symbol_size*j],rx128_re_1);
_mm_storeu_ps(&r_im[a][(ofdm_symbol_size-n_samples)+4*i+ofdm_symbol_size*j],rx128_im_1);*/
/*clock_t stop=clock();
printf("do_DL_sig time is %f s, AVERAGE time is %f s, count %d, sum %e\n",(float) (stop-start)/CLOCKS_PER_SEC,(float) (sum+stop-start)/(count*CLOCKS_PER_SEC),count,sum+stop-start);
sum=(sum+stop-start);*/
......
......@@ -17,6 +17,17 @@
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*-------------------------------------------------------------------------------
* Optimization using SIMD instructions
* Frecuency Domain Analysis
* Luis Felipe Ariza Vesga, email:lfarizav@unal.edu.co
* Functions: init_freq_channel_SSE_float(), freq_channel_SSE_float(),
* init_freq_channel_prach(), init_freq_channel_prach_SSE_float(),
* freq_channel_prach (), freq_channel_prach_SSE_float().
*
* sincos_ps(), log_ps() --> Functions mofified from Miloyip and Cephes sources.
* More info https://github.com/miloyip/normaldist-benchmark.
*-------------------------------------------------------------------------------
*/
#include <math.h>
......@@ -63,7 +74,6 @@ int init_freq_channel(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples)
}
first_run=0;
}
for (f=-(n_samples>>1); f<=(n_samples>>1); f++) {
//count++;
freq=delta_f*(double)f*1e-6;// due to the fact that delays is in mus
......@@ -75,11 +85,8 @@ int init_freq_channel(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples)
cos_lut[f+(n_samples>>1)][l] = cos(2*M_PI*freq*delay);
sin_lut[f+(n_samples>>1)][l] = sin(2*M_PI*freq*delay);
//printf("[%d][%d] (cos,sin) (%e,%e):\n",f,l,cos_lut[f+(n_samples>>1)][l],sin_lut[f+(n_samples>>1)][l]);
}
}
//printf("count %d\n",count);
return(0);
}
......@@ -100,8 +107,6 @@ int freq_channel(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples)
return(-1);
}
// printf("no of taps:%d,",(int)desc->nb_taps);
if (freq_channel_init == 0) {
// we are initializing the lut for the largets possible n_samples=12*nb_rb+1
// if called with n_samples<12*nb_rb+1, we decimate the lut
......@@ -113,9 +118,6 @@ int freq_channel(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples)
}
d=(n_samples_max-1)/(n_samples-1);
//printf("no_samples=%d, n_samples_max=%d, d=%d\n",n_samples,n_samples_max,d);
start_meas(&desc->interp_freq);
for (f=-(n_samples_max>>1),f2=-(n_samples>>1); f<(n_samples_max>>1); f+=d,f2++) {
......@@ -134,7 +136,6 @@ int freq_channel(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples)
desc->chF[aarx+(aatx*desc->nb_rx)][(n_samples>>1)+f2].y+=(-desc->a[l][aarx+(aatx*desc->nb_rx)].x*slut[l]+
desc->a[l][aarx+(aatx*desc->nb_rx)].y*clut[l]);
}
printf("f %d, (chF.x,chF.y) (%e,%e):\n",f,desc->chF[0][(n_samples>>1)+f].x,desc->chF[0][(n_samples>>1)+f].y);
}
}
}
......@@ -153,9 +154,6 @@ int init_freq_channel_SSE_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_sa
int16_t f;
uint8_t l;
__m128 cos_lut128,sin_lut128;
static float out[4] __attribute__((aligned(16)));
static float out1[4] __attribute__((aligned(16)));
//static int count=0;
if ((n_samples&1)==0) {
fprintf(stderr, "freq_channel_init: n_samples has to be odd\n");
......@@ -185,26 +183,11 @@ int init_freq_channel_SSE_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_sa
delay = desc->delays[l]+NB_SAMPLES_CHANNEL_OFFSET/desc->sampling_rate;
//sincos_ps(_mm_set_ps(twopi*(4*f+3)*delay,twopi*(4*f+2)*delay,twopi*(4*f+1)*delay,twopi*(4*f)*delay), &sin_lut128, &cos_lut128);
//_mm_storeu_ps(out,sin_lut128);
//printf("sin sincos SSE is %e,%e,%e,%e\n",out[0],out[1],out[2],out[3]);
cos_lut128=_mm_set_ps(cos(twopi*(4*f+3)*delay),cos(twopi*(4*f+2)*delay),cos(twopi*(4*f+1)*delay),cos(twopi*(4*f)*delay));
sin_lut128=_mm_set_ps(sin(twopi*(4*f+3)*delay),sin(twopi*(4*f+2)*delay),sin(twopi*(4*f+1)*delay),sin(twopi*(4*f)*delay));
//_mm_storeu_ps(out1,sin_lut128);
//printf("sin SSE1 is %e,%e,%e,%e\n",out1[0],out1[1],out1[2],out1[3]);
_mm_storeu_ps(&cos_lut_f[l][4*f+(n_samples>>1)],cos_lut128);
_mm_storeu_ps(&sin_lut_f[l][4*f+(n_samples>>1)],sin_lut128);
//cos_lut[f+(n_samples>>1)][l] = cos(2*M_PI*freq*delay);
//sin_lut[f+(n_samples>>1)][l] = sin(2*M_PI*freq*delay);
//printf("cos %e,%e,%e,%e\n",cos_lut128[0],cos_lut128[1],cos_lut128[2],cos_lut128[3]);
//printf("sin %e,%e,%e,%e\n",sin_lut128[0],sin_lut128[1],sin_lut128[2],sin_lut128[3]);
//printf("arg %e, f %d, values cos:%e, sin:%e, cos# %e, sin# %e\n",twopi*(4*f)*delay,4*f+(n_samples>>1), cos_lut_f[l][4*f+(n_samples>>1)], sin_lut_f[l][4*f+(n_samples>>1)],cos(twopi*(4*f)*delay),sin(twopi*(4*f)*delay));
//printf("arg %e, f %d, values cos:%e, sin:%e, cos# %e, sin# %e\n",twopi*(4*f+1)*delay,4*f+1+(n_samples>>1), cos_lut_f[l][4*f+1+(n_samples>>1)], sin_lut_f[l][4*f+1+(n_samples>>1)],cos(twopi*(4*f+1)*delay),sin(twopi*(4*f+1)*delay));
//printf("arg %e, f %d, values cos:%e, sin:%e, cos# %e, sin# %e\n",twopi*(4*f+2)*delay,4*f+2+(n_samples>>1), cos_lut_f[l][4*f+2+(n_samples>>1)], sin_lut_f[l][4*f+2+(n_samples>>1)],cos(twopi*(4*f+2)*delay),sin(twopi*(4*f+2)*delay));
//printf("arg %e, f %d, values cos:%e, sin:%e, cos# %e, sin# %e\n",twopi*(4*f+3)*delay,4*f+3+(n_samples>>1), cos_lut_f[l][4*f+3+(n_samples>>1)], sin_lut_f[l][4*f+3+(n_samples>>1)],cos(twopi*(4*f+3)*delay),sin(twopi*(4*f+3)*delay));
//printf("f %d, cos0 %e, cos1 %e\n",2*f,(double) &cos_lut128[0],(double) &cos_lut128[1]);
//printf("f %d, sin0 %e, sin1 %e\n",2*f+1,(double) &sin_lut128[0],(double) &sin_lut128[1]);
}
}
for (l=0; l<(int)desc->nb_taps; l++)
......@@ -226,10 +209,6 @@ int init_freq_channel_SSE_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_sa
sin_lut128=_mm_set_ps(sin(twopi*(4*f)*delay),sin(twopi*(4*f-1)*delay),sin(twopi*(4*f-2)*delay),sin(twopi*(4*f-3)*delay));
_mm_storeu_ps(&cos_lut_f[l][4*f-3+(n_samples>>1)],cos_lut128);
_mm_storeu_ps(&sin_lut_f[l][4*f-3+(n_samples>>1)],sin_lut128);
//cos_lut[f+(n_samples>>1)][l] = cos(2*M_PI*freq*delay);
//sin_lut[f+(n_samples>>1)][l] = sin(2*M_PI*freq*delay);
//printf("values cos:%d, sin:%d\n", cos_lut[f][l], sin_lut[f][l]);
}
}
/*for (f=-(n_samples>>1); f<=(n_samples>>1); f++) {
......@@ -428,65 +407,7 @@ int init_freq_channel_prach_SSE_float(channel_desc_t *desc,uint16_t nb_rb,int16_
return(0);
}
/*#ifdef abstraction_SSE
int freq_channel_prach(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples,int16_t prach_fmt,int16_t n_ra_prb)
{
int16_t f;
uint8_t aarx,aatx,l;
double *clut,*slut;
int prach_samples;
static int freq_channel_init=0;
static int n_samples_max=0;
__m128d clut128,slut128,chFx_128,chFy_128;
prach_samples = (prach_fmt<4)?13+839+12:3+139+2;
// do some error checking
if (nb_rb-n_ra_prb<6) {
fprintf(stderr, "freq_channel_init: Impossible to allocate PRACH, check r_ra_prb value (r_ra_prb=%d)\n",n_ra_prb);
return(-1);
}
if (freq_channel_init == 0) {
// we are initializing the lut for the largets possible n_samples=12*nb_rb+1
// if called with n_samples<12*nb_rb+1, we decimate the lut
n_samples_max=12*nb_rb+1;
if (init_freq_channel_prach(desc,nb_rb,n_samples_max,prach_fmt,n_ra_prb)==0)
freq_channel_init=1;
else
return(-1);
}
start_meas(&desc->interp_freq_PRACH);
for (f=0; f<(prach_samples>>1); f++) {
//clut = cos_lut[f];
//slut = sin_lut[f];
for (aarx=0; aarx<desc->nb_rx; aarx++) {
for (aatx=0; aatx<desc->nb_tx; aatx++) {
//desc->chF_prach[aarx+(aatx*desc->nb_rx)][f].x=0.0;
//desc->chF_prach[aarx+(aatx*desc->nb_rx)][f].y=0.0;
chFx_128=_mm_setzero_pd();
chFy_128=_mm_setzero_pd();
for (l=0; l<(int)desc->nb_taps; l++) {
//desc->chF_prach[aarx+(aatx*desc->nb_rx)][f].x+=(desc->a[l][aarx+(aatx*desc->nb_rx)].x*clut[l]+
// desc->a[l][aarx+(aatx*desc->nb_rx)].y*slut[l]);
//desc->chF_prach[aarx+(aatx*desc->nb_rx)][f].y+=(-desc->a[l][aarx+(aatx*desc->nb_rx)].x*slut[l]+
// desc->a[l][aarx+(aatx*desc->nb_rx)].y*clut[l]);
chFx_128=_mm_add_pd(chFx_128,_mm_add_pd(_mm_mul_pd(_mm_set1_pd(desc->a[l][aarx+(aatx*desc->nb_rx)].x),_mm_loadu_pd(&cos_lut[2*f][l])),_mm_mul_pd(_mm_set1_pd(desc->a[l][aarx+(aatx*desc->nb_rx)].y),_mm_loadu_pd(&sin_lut[2*f][l]))));
chFy_128=_mm_add_pd(chFy_128,_mm_sub_pd(_mm_mul_pd(_mm_set1_pd(desc->a[l][aarx+(aatx*desc->nb_rx)].y),_mm_loadu_pd(&cos_lut[2*f][l])),_mm_mul_pd(_mm_set1_pd(desc->a[l][aarx+(aatx*desc->nb_rx)].x),_mm_loadu_pd(&sin_lut[2*f][l]))));
}
_mm_storeu_pd(&desc->chF_prach[aarx+(aatx*desc->nb_rx)][2*f].x,chFx_128);
_mm_storeu_pd(&desc->chF_prach[aarx+(aatx*desc->nb_rx)][2*f].y,chFy_128);
}
}
//if (f<10 || (f>829&&f<839))
// printf("chF_prach[0][%d], (x,y) = (%e,%e)\n",f,desc->chF_prach[0][f].x,desc->chF_prach[0][f].y);
}
stop_meas(&desc->interp_freq_PRACH);
return(0);
}
#else*/
int freq_channel_prach(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples,int16_t prach_fmt,int16_t n_ra_prb)
{
......@@ -940,7 +861,79 @@ __m128 log_ps(__m128 x) {
x = _mm_add_ps(x, y);
x = _mm_add_ps(x, tmp);
x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
//printf("ln is %e,%e,%e,%e\n",x[0],x[1],x[2],x[3]);
return x;
}
/*#ifdef abstraction_SSE
int freq_channel_prach(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples,int16_t prach_fmt,int16_t n_ra_prb)
{
int16_t f;
uint8_t aarx,aatx,l;
double *clut,*slut;
int prach_samples;
static int freq_channel_init=0;
static int n_samples_max=0;
__m128d clut128,slut128,chFx_128,chFy_128;
prach_samples = (prach_fmt<4)?13+839+12:3+139+2;
// do some error checking
if (nb_rb-n_ra_prb<6) {
fprintf(stderr, "freq_channel_init: Impossible to allocate PRACH, check r_ra_prb value (r_ra_prb=%d)\n",n_ra_prb);
return(-1);
}
if (freq_channel_init == 0) {
// we are initializing the lut for the largets possible n_samples=12*nb_rb+1
// if called with n_samples<12*nb_rb+1, we decimate the lut
n_samples_max=12*nb_rb+1;
if (init_freq_channel_prach(desc,nb_rb,n_samples_max,prach_fmt,n_ra_prb)==0)
freq_channel_init=1;
else
return(-1);
}
start_meas(&desc->interp_freq_PRACH);
for (f=0; f<(prach_samples>>1); f++) {
//clut = cos_lut[f];
//slut = sin_lut[f];
for (aarx=0; aarx<desc->nb_rx; aarx++) {
for (aatx=0; aatx<desc->nb_tx; aatx++) {
//desc->chF_prach[aarx+(aatx*desc->nb_rx)][f].x=0.0;
//desc->chF_prach[aarx+(aatx*desc->nb_rx)][f].y=0.0;
chFx_128=_mm_setzero_pd();
chFy_128=_mm_setzero_pd();
for (l=0; l<(int)desc->nb_taps; l++) {
//desc->chF_prach[aarx+(aatx*desc->nb_rx)][f].x+=(desc->a[l][aarx+(aatx*desc->nb_rx)].x*clut[l]+
// desc->a[l][aarx+(aatx*desc->nb_rx)].y*slut[l]);
//desc->chF_prach[aarx+(aatx*desc->nb_rx)][f].y+=(-desc->a[l][aarx+(aatx*desc->nb_rx)].x*slut[l]+
// desc->a[l][aarx+(aatx*desc->nb_rx)].y*clut[l]);
chFx_128=_mm_add_pd(chFx_128,_mm_add_pd(_mm_mul_pd(_mm_set1_pd(desc->a[l][aarx+(aatx*desc->nb_rx)].x),_mm_loadu_pd(&cos_lut[2*f][l])),_mm_mul_pd(_mm_set1_pd(desc->a[l][aarx+(aatx*desc->nb_rx)].y),_mm_loadu_pd(&sin_lut[2*f][l]))));
chFy_128=_mm_add_pd(chFy_128,_mm_sub_pd(_mm_mul_pd(_mm_set1_pd(desc->a[l][aarx+(aatx*desc->nb_rx)].y),_mm_loadu_pd(&cos_lut[2*f][l])),_mm_mul_pd(_mm_set1_pd(desc->a[l][aarx+(aatx*desc->nb_rx)].x),_mm_loadu_pd(&sin_lut[2*f][l]))));
}
_mm_storeu_pd(&desc->chF_prach[aarx+(aatx*desc->nb_rx)][2*f].x,chFx_128);
_mm_storeu_pd(&desc->chF_prach[aarx+(aatx*desc->nb_rx)][2*f].y,chFy_128);
}
}
//if (f<10 || (f>829&&f<839))
// printf("chF_prach[0][%d], (x,y) = (%e,%e)\n",f,desc->chF_prach[0][f].x,desc->chF_prach[0][f].y);
}
stop_meas(&desc->interp_freq_PRACH);
return(0);
}
#else*/
//cos_lut[f+(n_samples>>1)][l] = cos(2*M_PI*freq*delay);
//sin_lut[f+(n_samples>>1)][l] = sin(2*M_PI*freq*delay);
//printf("cos %e,%e,%e,%e\n",cos_lut128[0],cos_lut128[1],cos_lut128[2],cos_lut128[3]);
//printf("sin %e,%e,%e,%e\n",sin_lut128[0],sin_lut128[1],sin_lut128[2],sin_lut128[3]);
//printf("arg %e, f %d, values cos:%e, sin:%e, cos# %e, sin# %e\n",twopi*(4*f)*delay,4*f+(n_samples>>1), cos_lut_f[l][4*f+(n_samples>>1)], sin_lut_f[l][4*f+(n_samples>>1)],cos(twopi*(4*f)*delay),sin(twopi*(4*f)*delay));
//printf("arg %e, f %d, values cos:%e, sin:%e, cos# %e, sin# %e\n",twopi*(4*f+1)*delay,4*f+1+(n_samples>>1), cos_lut_f[l][4*f+1+(n_samples>>1)], sin_lut_f[l][4*f+1+(n_samples>>1)],cos(twopi*(4*f+1)*delay),sin(twopi*(4*f+1)*delay));
//printf("arg %e, f %d, values cos:%e, sin:%e, cos# %e, sin# %e\n",twopi*(4*f+2)*delay,4*f+2+(n_samples>>1), cos_lut_f[l][4*f+2+(n_samples>>1)], sin_lut_f[l][4*f+2+(n_samples>>1)],cos(twopi*(4*f+2)*delay),sin(twopi*(4*f+2)*delay));
//printf("arg %e, f %d, values cos:%e, sin:%e, cos# %e, sin# %e\n",twopi*(4*f+3)*delay,4*f+3+(n_samples>>1), cos_lut_f[l][4*f+3+(n_samples>>1)], sin_lut_f[l][4*f+3+(n_samples>>1)],cos(twopi*(4*f+3)*delay),sin(twopi*(4*f+3)*delay));
//printf("f %d, cos0 %e, cos1 %e\n",2*f,(double) &cos_lut128[0],(double) &cos_lut128[1]);
//printf("f %d, sin0 %e, sin1 %e\n",2*f+1,(double) &sin_lut128[0],(double) &sin_lut128[1]);
......@@ -89,8 +89,8 @@ void fill_channel_desc(channel_desc_t *chan_desc,
chan_desc->max_Doppler = max_Doppler;
chan_desc->ch = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chF = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->a = (struct complex**) malloc(nb_taps*sizeof(struct complex*));
LOG_D(OCM,"[CHANNEL] Filling ch \n");
......@@ -303,8 +303,8 @@ channel_desc_t *new_channel_desc_scm(uint8_t nb_tx,
chan_desc->random_aoa = 0;
chan_desc->ch = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chF = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->a = (struct complex**) malloc(chan_desc->nb_taps*sizeof(struct complex*));
for (i = 0; i<nb_tx*nb_rx; i++)
chan_desc->ch[i] = (struct complex*) malloc(chan_desc->channel_length * sizeof(struct complex));
......@@ -358,8 +358,8 @@ channel_desc_t *new_channel_desc_scm(uint8_t nb_tx,
chan_desc->random_aoa = 0;
chan_desc->ch = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chF = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->a = (struct complex**) malloc(chan_desc->nb_taps*sizeof(struct complex*));
for (i = 0; i<nb_tx*nb_rx; i++)
chan_desc->ch[i] = (struct complex*) malloc(chan_desc->channel_length * sizeof(struct complex));
......@@ -412,8 +412,8 @@ channel_desc_t *new_channel_desc_scm(uint8_t nb_tx,
chan_desc->random_aoa = 0;
chan_desc->ch = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chF = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->a = (struct complex**) malloc(chan_desc->nb_taps*sizeof(struct complex*));
for (i = 0; i<nb_tx*nb_rx; i++)
chan_desc->ch[i] = (struct complex*) malloc(chan_desc->channel_length * sizeof(struct complex));
......@@ -458,8 +458,8 @@ channel_desc_t *new_channel_desc_scm(uint8_t nb_tx,
chan_desc->random_aoa = 0;
chan_desc->ch = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chF = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->a = (struct complex**) malloc(chan_desc->nb_taps*sizeof(struct complex*));
for (i = 0; i<nb_tx*nb_rx; i++)
chan_desc->ch[i] = (struct complex*) malloc(chan_desc->channel_length * sizeof(struct complex));
......@@ -507,8 +507,8 @@ channel_desc_t *new_channel_desc_scm(uint8_t nb_tx,
chan_desc->random_aoa = 0;
chan_desc->ch = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chF = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->a = (struct complex**) malloc(chan_desc->nb_taps*sizeof(struct complex*));
for (i = 0; i<nb_tx*nb_rx; i++)
chan_desc->ch[i] = (struct complex*) malloc(chan_desc->channel_length * sizeof(struct complex));
......@@ -556,8 +556,8 @@ channel_desc_t *new_channel_desc_scm(uint8_t nb_tx,
chan_desc->random_aoa = 0;
chan_desc->ch = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chF = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->a = (struct complex**) malloc(chan_desc->nb_taps*sizeof(struct complex*));
for (i = 0; i<nb_tx*nb_rx; i++)
chan_desc->ch[i] = (struct complex*) malloc(chan_desc->channel_length * sizeof(struct complex));
......@@ -604,8 +604,8 @@ channel_desc_t *new_channel_desc_scm(uint8_t nb_tx,
chan_desc->random_aoa = 0;
chan_desc->ch = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chF = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->a = (struct complex**) malloc(chan_desc->nb_taps*sizeof(struct complex*));
for (i = 0; i<nb_tx*nb_rx; i++)
chan_desc->ch[i] = (struct complex*) malloc(chan_desc->channel_length * sizeof(struct complex));
......@@ -650,8 +650,8 @@ channel_desc_t *new_channel_desc_scm(uint8_t nb_tx,
chan_desc->random_aoa = 0;
chan_desc->ch = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chF = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->a = (struct complex**) malloc(chan_desc->nb_taps*sizeof(struct complex*));
for (i = 0; i<nb_tx*nb_rx; i++)
chan_desc->ch[i] = (struct complex*) malloc(chan_desc->channel_length * sizeof(struct complex));
......@@ -696,8 +696,8 @@ channel_desc_t *new_channel_desc_scm(uint8_t nb_tx,
chan_desc->random_aoa = 0;
chan_desc->ch = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chF = (struct complex**) malloc(nb_tx*nb_rx*sizeof(struct complex*));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));
chan_desc->chFf = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->chF_prach = (struct complexf*) malloc(nb_tx*nb_rx*sizeof(struct complexf));//To allow SIMD intrinsic functions
chan_desc->a = (struct complex**) malloc(chan_desc->nb_taps*sizeof(struct complex*));
for (i = 0; i<nb_tx*nb_rx; i++)
chan_desc->ch[i] = (struct complex*) malloc(chan_desc->channel_length * sizeof(struct complex));
......@@ -1275,8 +1275,8 @@ int random_channel(channel_desc_t *desc, uint8_t abstraction_flag) {
for (aarx=0;aarx<desc->nb_rx;aarx++) {
for (aatx=0;aatx<desc->nb_tx;aatx++) {
anew[aarx+(aatx*desc->nb_rx)].x = sqrt(desc->ricean_factor*desc->amps[i]/2) * ziggurat(0.0,1.0);
anew[aarx+(aatx*desc->nb_rx)].y = sqrt(desc->ricean_factor*desc->amps[i]/2) * ziggurat(0.0,1.0);
anew[aarx+(aatx*desc->nb_rx)].x = sqrt(desc->ricean_factor*desc->amps[i]/2) * ziggurat(0.0,1.0);// Ziggurat function to improve pseudo-random normal number generation.
anew[aarx+(aatx*desc->nb_rx)].y = sqrt(desc->ricean_factor*desc->amps[i]/2) * ziggurat(0.0,1.0);// Ziggurat function to improve pseudo-random normal number generation.
if ((i==0) && (desc->ricean_factor != 1.0)) {
if (desc->random_aoa==1) {
......@@ -1422,8 +1422,8 @@ int random_channel_freq(channel_desc_t *desc, uint8_t abstraction_flag) {
for (aarx=0;aarx<desc->nb_rx;aarx++) {
for (aatx=0;aatx<desc->nb_tx;aatx++) {
anew[aarx+(aatx*desc->nb_rx)].x = sqrt(desc->ricean_factor*desc->amps[i]/2) * ziggurat(0.0,1.0);
anew[aarx+(aatx*desc->nb_rx)].y = sqrt(desc->ricean_factor*desc->amps[i]/2) * ziggurat(0.0,1.0);
anew[aarx+(aatx*desc->nb_rx)].x = sqrt(desc->ricean_factor*desc->amps[i]/2) * ziggurat(0.0,1.0);// Ziggurat function to improve pseudo-random normal number generation.
anew[aarx+(aatx*desc->nb_rx)].y = sqrt(desc->ricean_factor*desc->amps[i]/2) * ziggurat(0.0,1.0);// Ziggurat function to improve pseudo-random normal number generation.
if ((i==0) && (desc->ricean_factor != 1.0)) {
if (desc->random_aoa==1) {
......
......@@ -17,6 +17,17 @@
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*-------------------------------------------------------------------------------
* Optimization using SIMD instructions
* Frecuency Domain Analysis
* Luis Felipe Ariza Vesga, email:lfarizav@unal.edu.co
* Functions: SHR3, UNI, NOR, nfix(), table_nor(), Ziggurat() -->"The Ziggurat-
* Method for Generating Random Variables", G. Marsaglia, W. W. Tsang.
* Functions: SHR3_SSE, UNI_SSE, NOR_SSE, nfix_SSE() --> SSE versions of Ziggurat Method.
* Functions: boxmuller_SSE_float () --> Box-Muller pseudo-random normal-
* number generation modificated version from Miloyip and Cephes sources.
* More info https://github.com/miloyip/normaldist-benchmark.
*-------------------------------------------------------------------------------
*/
#ifdef USER_MODE
......@@ -42,43 +53,6 @@ static unsigned int seed, iy, ir[98];
#define mod 4294967296.0 /* is 2**32 */
#if 1
//#define randominit_SSE
#ifdef randominit_SSE
void randominit(unsigned seed_init)
{
int i;
// this need to be integrated with the existing rng, like taus: navid
msg("Initializing random number generator, seed %x\n",seed_init);
if (seed_init == 0) {
srand((unsigned)time(NULL));
seed_array[0] = (unsigned int) rand();
seed_array[1] = (unsigned int) rand();
seed_array[2] = (unsigned int) rand();
seed_array[3] = (unsigned int) rand();
seed = (unsigned int) rand();
} else {
seed = seed_init;
seed_array[0] = seed_init;
seed_array[1] = log(seed_init);
seed_array[2] = pow(seed_init,3);
seed_array[3] = sqrt(seed_init);
}
if (seed % 2 == 0) seed += 1; /* seed and mod are relative prime */
for (i=0;i<4;i++)
if (seed_array[i] % 2 == 0) seed_array[i] += 1; /* seed and mod are relative prime */
for (i=1; i<4*98; i++) { /*4 times 98 to use in SSE implementations*/
seed_array[i%4] = a*seed_array[i%4]; /* mod 2**32 */
ir[i]= seed_array[i%4]; /* initialize the shuffle table */
}
for ( i = 0; i < 4*98; i++ ){
printf("ir[%d]: %d\n",i,ir[i]);
}
iy=1;
}
#else
void randominit(unsigned seed_init)
{
int i;
......@@ -102,7 +76,6 @@ void randominit(unsigned seed_init)
iy=1;
}
#endif
#endif
/*!\brief Uniform linear congruential random number generator on \f$[0,1)\f$. Returns a double-precision floating-point number.*/
......@@ -115,112 +88,81 @@ double uniformrandom(void)
ir[j] = seed;
return( (double) iy/mod );
}
/*void uniformrandomSSE(__m128d *d1,__m128d *d2)
{
int i,j;
__m128d u_rand128;
__m128i j128;
int j_array[]={(int) (1 + 9.103678167e-08*iy_array[0]),(int) (1 + 9.103678167e-08*iy_array[1]),(int) (1 + 9.103678167e-08*iy_array[2]),(int) (1 + 9.103678167e-08*iy_array[3])};
//j128=_mm_setr_epi32(1 + 391.0*iy_array[0]/mod,1 + 391.0*iy_array[1]/mod,1 + 391.0*iy_array[2]/mod,1 + 391.0*iy_array[3]/mod);
//j128=_mm_setr_epi32(391,391,391,391);
//j128=_mm_mul_epu32(j128,_mm_setr_epi32(iy_array[0],iy_array[1],iy_array[2],iy_array[3]));
//j128=_mm_slli_epi32(j128,32);
//j=1 + 97.0*iy/mod;
iy_array[0]=ir[j_array[0]];
iy_array[1]=ir[j_array[1]];
iy_array[2]=ir[j_array[2]];
iy_array[3]=ir[j_array[3]];
seed_array[0] = a*seed_array[0];
seed_array[1] = a*seed_array[1];
seed_array[2] = a*seed_array[2];
seed_array[3] = a*seed_array[3];
ir[j_array[0]] = seed_array[0];
ir[j_array[1]] = seed_array[1];
ir[j_array[2]] = seed_array[2];
ir[j_array[3]] = seed_array[3];
*d1=_mm_setr_pd (2*((double) iy_array[0]/mod)-1, 2*((double) iy_array[1])-1);
*d2=_mm_setr_pd (2*((double) iy_array[2]/mod)-1, 2*((double) iy_array[3])-1);
//return ((double) iy/mod );
return;
}*/
/*!\brief Ziggurat random number generator based on rejection sampling. It returns a pseudorandom normally distributed double number with x=0 and variance=1*/
//Procedure to create tables for normal distribution kn,wn and fn
/*!\brief Ziggurat random number generator based on rejection sampling. It returns a pseudo-random normally distributed number with x=0 and variance=1*/
static double wn[128],fn[128];
static uint32_t iz,jz,jsr=123456789,kn[128];
static int32_t hz;
//#define SHR3 (jz=jsr,printf("jsr %lu ",jsr), jsr^=(jsr<<13),printf("jsr<<13 %lu ",jsr),jsr^=(jsr>>17),printf("jsr>>17 %lu ",jsr),jsr^=(jsr<<5),printf("jsr<<5 %lu\n",jsr),printf("SHR3 jsr %lu, jz %lu, jsr+jz %lu\n",jsr,jz,jz+jsr),jz+jsr)
#define SHR3 (jz=jsr, jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5),jz+jsr)
#define UNI (0.5+(signed) SHR3 * 0.2328306e-9)
#define NOR (hz=SHR3,iz=(hz&127),(abs(hz)<kn[iz])? hz*wn[iz] : nfix())
static double wn[128],fn[128];
static uint32_t iz,jz,jsr=123456789,kn[128];
static int32_t hz;
static uint32_t jsr4[4] __attribute__((aligned(16))) = {123456789,112548569,985584512,452236879};//This initialization depends on the seed for nor_table function in oaisim_functions.c file.
static uint32_t iz4[4] __attribute__((aligned(16)));
static float out[4] __attribute__((aligned(16)));
static uint32_t ssh3_sse4[4] __attribute__((aligned(16)));
static uint32_t ifabs4[4] __attribute__((aligned(16)));
static uint32_t hz4[4] __attribute__((aligned(16)));
#if defined(__x86_64__) || defined(__i386__)
static int32_t ifabs4[4] __attribute__((aligned(16)));
static int32_t hz4[4] __attribute__((aligned(16)));
static int32_t abshz4[4] __attribute__((aligned(16)));
static float x4[4] __attribute__((aligned(16)));
static float y4[4] __attribute__((aligned(16)));
static __m128i jsr_128 __attribute__((aligned(16)));
static __m128i jz_128 __attribute__((aligned(16)));
static __m128i hz_128 __attribute__((aligned(16)));
static __m128i abs_hz_128 __attribute__((aligned(16)));
static __m128i iz_128 __attribute__((aligned(16)));
static __m128 x128 __attribute__((aligned(16)));
static __m128i ifabs __attribute__((aligned(16)));
//#define jsr4 (jz=jsr, jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5),jz+jsr,printf("seed %d, next seed %d\n",jz,jsr))
//#define SHR3_SSE (jsr_128=_mm_loadu_si128(jsr4),jz_128=jsr_128, printf("jsr4 is %lu,%lu,%lu,%lu\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]), jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,13),jsr_128),_mm_storeu_si128((__m128i *)jsr4,jsr_128),printf("jsr128<<13 is %lu,%lu,%lu,%lu\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]), jsr_128=_mm_xor_si128(_mm_srli_epi32(jsr_128,17),jsr_128),_mm_storeu_si128((__m128i *)jsr4,jsr_128),printf("jsr128>>17 is %lu,%lu,%lu,%lu\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]), jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,5),jsr_128),_mm_storeu_si128((__m128i *)jsr4,jsr_128), printf("jsr128<<5 is %lu,%lu,%lu,%lu\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]), _mm_storeu_si128(out,_mm_add_epi32(jz_128,jsr_128)),printf("out is %lu,%lu,%lu,%lu\n",out[0],out[1],out[2],out[3]),_mm_add_epi32(jz_128,jsr_128))
#define SHR3_SSE (jsr_128=_mm_loadu_si128((__m128i *)jsr4),jz_128=jsr_128, jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,13),jsr_128),jsr_128=_mm_xor_si128(_mm_srli_epi32(jsr_128,17),jsr_128),jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,5),jsr_128),_mm_storeu_si128((__m128i *)jsr4,jsr_128),_mm_add_epi32(jz_128,jsr_128))
//#define UNI_SSE (_mm_storeu_ps(out,_mm_add_ps(_mm_mul_ps(_mm_set1_ps(0.2328306e-9),_mm_cvtepi32_ps(SHR3_SSE)),_mm_set1_ps(0.5))),printf("out is %e,%e,%e,%e\n",out[0],out[1],out[2],out[3]),_mm_add_ps(_mm_mul_ps(_mm_set1_ps(0.2328306e-9),_mm_cvtepi32_ps(SHR3_SSE)),_mm_set1_ps(0.5)))
#define SHR3_SSE (jsr_128=_mm_loadu_si128((__m128i *)jsr4),jz_128=jsr_128, jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,13),jsr_128),jsr_128=_mm_xor_si128(_mm_srli_epi32(jsr_128,17),jsr_128),jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,5),jsr_128),_mm_storeu_si128((__m128i *)jsr4,jsr_128),_mm_add_epi32(jz_128,jsr_128))
#define UNI_SSE (_mm_add_ps(_mm_mul_ps(_mm_set1_ps(0.2328306e-9),_mm_cvtepi32_ps(SHR3_SSE)),_mm_set1_ps(0.5)))
//,_mm_storeu_si128(ssh3_sse4,hz_128),printf("ssh3_sse4 %lu,%lu,%lu,%lu\n",ssh3_sse4[0],ssh3_sse4[1],ssh3_sse4[2],ssh3_sse4[3])
#define NOR_SSE (hz_128=SHR3_SSE,_mm_storeu_si128(hz4,hz_128),iz_128=_mm_and_si128(hz_128,_mm_set1_epi32(127)),_mm_storeu_si128(iz4,iz_128),ifabs=_mm_cmplt_epi32(_mm_max_epi32(_mm_sub_epi32(_mm_setzero_si128(),hz_128),hz_128),_mm_setr_epi32(kn[iz4[0]],kn[iz4[1]],kn[iz4[2]],kn[iz4[3]])),_mm_storeu_si128(ifabs4,ifabs),printf("ifabs %x,%x,%x,%x\n",ifabs4[0],ifabs4[1],ifabs4[2],ifabs4[3]),x128=_mm_and_ps(_mm_cvtepi32_ps(_mm_cmplt_epi32(_mm_max_epi32(_mm_sub_epi32(_mm_setzero_si128(),hz_128),hz_128),_mm_setr_epi32(kn[iz4[0]],kn[iz4[1]],kn[iz4[2]],kn[iz4[3]]))),_mm_mul_ps(_mm_cvtepi32_ps(hz_128),_mm_setr_ps(wn[iz4[0]],wn[iz4[1]],wn[iz4[2]],wn[iz4[3]]))),printf("x128 %e,%e,%e,%e\n",x128[0],x128[1],x128[2],x128[3]),printf("iz %d,%d,%d,%d\n",iz4[0],iz4[1],iz4[2],iz4[3]),printf("wn*hz %e,%e,%e,%e\n",hz4[0]*wn[iz4[0]],hz4[1]*wn[iz4[1]],hz4[2]*wn[iz4[2]],hz4[3]*wn[iz4[3]]),_mm_set1_ps(-1,2,3,4))
#endif
//
//
//
//
//
#define NOR_SSE (hz_128=SHR3_SSE,_mm_storeu_si128((__m128i *)hz4,hz_128),iz_128=_mm_and_si128(hz_128,_mm_set1_epi32(127)),_mm_storeu_si128((__m128i *)iz4,iz_128),ifabs=_mm_cmplt_epi32(_mm_max_epi32(_mm_sub_epi32(_mm_setzero_si128(),hz_128),hz_128),_mm_setr_epi32(kn[iz4[0]],kn[iz4[1]],kn[iz4[2]],kn[iz4[3]])),_mm_storeu_si128((__m128i *)ifabs4,ifabs),abs_hz_128=_mm_and_si128(hz_128, _mm_set1_epi32(~0x80000000)),_mm_storeu_si128((__m128i *)abshz4,abs_hz_128),printf("abs_hz_128 %d,%d,%d,%d\n",abshz4[0],abshz4[1],abshz4[2],abshz4[3]),printf("kn %d,%d,%d,%d\n",kn[iz4[0]],kn[iz4[1]],kn[iz4[2]],kn[iz4[3]]),printf("ifabs %x,%x,%x,%x\n",ifabs4[0],ifabs4[1],ifabs4[2],ifabs4[3]),x128=_mm_and_ps(_mm_cvtepi32_ps(_mm_cmplt_epi32(_mm_max_epi32(_mm_sub_epi32(_mm_setzero_si128(),hz_128),hz_128),_mm_setr_epi32(kn[iz4[0]],kn[iz4[1]],kn[iz4[2]],kn[iz4[3]]))),_mm_mul_ps(_mm_cvtepi32_ps(hz_128),_mm_setr_ps(wn[iz4[0]],wn[iz4[1]],wn[iz4[2]],wn[iz4[3]]))),printf("x128 %e,%e,%e,%e\n",x128[0],x128[1],x128[2],x128[3]),printf("iz %d,%d,%d,%d\n",iz4[0],iz4[1],iz4[2],iz4[3]),printf("wn*hz %e,%e,%e,%e\n",hz4[0]*wn[iz4[0]],hz4[1]*wn[iz4[1]],hz4[2]*wn[iz4[2]],hz4[3]*wn[iz4[3]]),nfix_SSE)
//,_mm_storeu_si128(ssh3_sse4,hz_128),printf("ssh3_sse4 %lu,%lu,%lu,%lu\n",ssh3_sse4[0],ssh3_sse4[1],ssh3_sse4[2],ssh3_sse4[3])
//#define NOR (hz=SHR3, printf("hz %d\n",hz),sign=(hz&128)>>7,printf("sign %s\n",(sign)?"-":"+"),iz=hz&127,printf("iz %d\n",iz), (abs(hz)<kn[iz])? (sign)?(-1)*hz*wn[iz]:hz*wn[iz] : (sign)?(-1)*nfix():nfix())
/*__m128 nfix_SSE(void)
{
if (ifabs4[0]==ifabs4[1]==ifabs4[2]==ifabs4[3]==0xFFFFFFFF)
return _mm_mul_ps(_mm_cvtepi32_ps(hz_128),_mm_setr_ps(wn[iz4[0]],wn[iz4[1]],wn[iz4[2]],wn[iz4[3]]));
__m128 r = _mm_set1_ps(3.442620);
static __m128 x, y;
static iz0=0;
for (;;)
{
x= x128;
//printf("iz %d,x %e,l %d, hz %d,d %e wn %e\n",iz, x, l, hz, d, wn[iz]);
if (iz==0)
if (iz4[0]==0||iz4[1]==0||iz4[2]==0||iz4[3]==0)
{
do
{
x = - 0.2904764 * log (UNI);
y = - log (UNI);
x = _mm_mul_ps(_mm_set1_ps(-0.2904764), log_ps(UNI_SSE));
_mm_storeu_ps((__m128 *)x4,x);
y = _mm_mul_ps(_mm_set1_ps(-1), log_ps(UNI_SSE));
_mm_storeu_ps((__m128 *)y4,y);
}
while (y+y < x*x);
while (_mm_add_ps(y,y) < _mm_mul_ps(x,x));
(iz0==10)?iz0=0:iz0++;
return (hz>0)? _mm_set1_ps(r+x) : _mm_set1_ps(-r-x);
}
if (fn[iz]+UNI*(fn[iz-1]-fn[iz])<exp(-0.5*x*x)){
(iz0==10)?iz0=0:iz0++;
return _mm_set1_ps(x);
}
hz = SHR3;
iz = hz&127;
if (abs(hz) < kn[iz]){
(iz0==10)?iz0=0:iz0++;
return (_mm_set1_ps((hz)*wn[iz]));
}
}
}*/
double nfix(void)
{
const double r = 3.442620;
......@@ -228,7 +170,6 @@ double nfix(void)
for (;;)
{
x=hz * wn[iz];
//printf("iz %d,x %e,l %d, hz %d,d %e wn %e\n",iz, x, l, hz, d, wn[iz]);
if (iz==0)
{
do
......@@ -237,27 +178,23 @@ double nfix(void)
y = - log (UNI);
}
while (y+y < x*x);
/*printf("x*x %e,y+y %e\n",x*x,y+y);
printf("return 1: %e\n",(hz>0)? r+x : -r-x);*/
return (hz>0)? r+x : -r-x;
}
if (fn[iz]+UNI*(fn[iz-1]-fn[iz])<exp(-0.5*x*x)){
//printf("return 2: %e\n",x);
return x;
}
hz = SHR3;
iz = hz&127;
if (abs(hz) < kn[iz]){
/*printf("return 3: %e\n",(double)hz*wn[iz]);
printf("return 3: iz %d, hz %d, wn[%d] %e, hz*wz[%d] %e\n",iz,hz,iz,wn[iz],iz,wn[iz]*(double)hz);*/
return ((hz)*wn[iz]);
}
}
}
/*!\Procedure to create tables for normal distribution kn,wn and fn. */
void table_nor(unsigned long seed)
{
jsr=seed;
//printf("Seed for Ziggurat random number generator is %d\n",seed);
double dn = 3.442619855899;
int i;
const double m1 = 2147483648.0;
......@@ -283,45 +220,33 @@ void table_nor(unsigned long seed)
fn[i] = (exp (-0.5*dn*dn));
wn[i] = (dn / m1);
}
/*for ( i = 0; i <= 127; i++ ){
printf("i %d: kn %d, fn %e, wn %e\n",i,kn[i],fn[i],wn[i]);
}*/
return;
}
double ziggurat(double mean, double variance)
{
/*__m128 n __attribute__((aligned(16)));
__m128 m __attribute__((aligned(16)));
//n=NOR_SSE;
m=log_ps(_mm_setr_ps(1230,100,340,-1000));
printf("log using functions (1230,100,340,-1000) %e,%e,%e,%e\n",log(1230),log(100),log(340),log(-1000));
printf("log sse (1230,100,340,-1000) %e,%e,%e,%e\n",m[0],m[1],m[2],m[3]);*/
//__m128 aa;
//aa=NOR_SSE;
return NOR;
}
double ziggurat_SSE_float(double mean, double variance)
{
return NOR;
}
void boxmuller_SSE_float(__m128 *data1, __m128 *data2) {
//assert(count % 8 == 0);
__m128 twopi = _mm_set1_ps(2.0f * 3.14159265358979323846f);
//__m128 one = _mm_set1_ps(1.0f);
__m128 minustwo = _mm_set1_ps(-2.0f);
__m128 u1_ps,u2_ps;
__m128 radius,theta,sintheta,costheta;
u1_ps = UNI_SSE;//_mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2),UNI_SSE),_mm_set1_ps(-1));
u2_ps = UNI_SSE;//_mm_sub_ps(_mm_mul_ps(_mm_set1_ps(2),UNI_SSE),_mm_set1_ps(-1));
//_mm_storeu_ps(out,u1_ps);
//_mm_storeu_ps(out1,u2_ps);
//printf("NOR SSE is %e,%e,%e,%e\n",out[0],out[1],out[2],out[3]);
//printf("NOR SSE1 is %e,%e,%e,%e\n",out1[0],out1[1],out1[2],out1[3]);
u1_ps = UNI_SSE;
u2_ps = UNI_SSE;
radius = _mm_sqrt_ps(_mm_mul_ps(minustwo, log_ps(u1_ps)));
theta = _mm_mul_ps(twopi, u2_ps);
sincos_ps(theta, &sintheta, &costheta);
*data1=_mm_mul_ps(radius, costheta);
*data2=_mm_mul_ps(radius, sintheta);
//}
}
/*
@defgroup _gaussdouble Gaussian random number generator based on modified Box-Muller transformation.
......@@ -329,65 +254,9 @@ void boxmuller_SSE_float(__m128 *data1, __m128 *data2) {
*/
/*!\brief Gaussian random number generator based on modified Box-Muller transformation.Returns a double-precision floating-point number. */
//#define random_SSE
#ifdef random_SSE
double gaussdouble(double mean, double variance)
{
static int iset=0;
static float gset;
float fac,rn,r[2],v1[2],v2[2],ones[]={1.0,1.0,1.0,1.0};
static double max=-1000000;
static double min=1000000;
__m128 v1_128, v2_128, r128, ones128,compge128_mask;
ones128 = _mm_load_ps(ones);
if (iset == 0) {
do {
//v2 = 2.0*uniformrandom()-1.0;
v1_128 = _mm_set1_ps(2*UNI-1);
v2_128 = _mm_set1_ps(2*UNI-1);
//r = v1*v1+v2*v2;
r128= _mm_add_ps(_mm_mul_ps(v1_128,v1_128),_mm_mul_ps(v2_128,v2_128));
compge128_mask=_mm_cmpge_ps(r128,ones128);
//_mm_storeu_ps(&r[0],r128);
//printf("Inside do: r[0] %e, r[1] %e\n",r[0],r[1]);
} while (compge128_mask[0] != 4294967295 && compge128_mask[1]!=4294967295 && compge128_mask[2]!=4294967295 && compge128_mask[3]!=4294967295);
//printf("outside do: r[0] %e, r[1] %e\n",r[0],r[1]);
if (r[0]<r[1]){
fac = sqrt(-2.0*log(r[0])/r[0]);
gset= v1[0]*fac;
iset=1;
return(sqrt(variance)*v1[1]*fac + mean);
}
else{
fac = sqrt(-2.0*log(r[1])/r[1]);
gset= v2[0]*fac;
iset=1;
return(sqrt(variance)*v2[1]*fac + mean);
}
} else {
iset=0;
//printf("normal random number %e, max %e, min %e\n",sqrt(variance)*gset + mean, max,min);
if (max<sqrt(variance)*gset + mean)
max=sqrt(variance)*gset + mean;
if (min>sqrt(variance)*gset + mean)
min=sqrt(variance)*gset + mean;
return(sqrt(variance)*gset + mean);
}
}
#else
double gaussdouble(double mean, double variance)
{
/*static int first_run;
static double sum;
static int count;
if (!first_run)
{
first_run=1;
sum=0;
count=0;
} */
static int iset=0;
static double gset;
double fac,r,v1,v2;
......@@ -396,24 +265,16 @@ double gaussdouble(double mean, double variance)
if (iset == 0) {
do {
/*count++;
clock_t start=clock();*/
v1 = 2.0*UNI-1.0;
/*clock_t stop=clock();
printf("UE_freq_channel time is %f s, AVERAGE time is %f s, count %d, sum %e\n",(float) (stop-start)/CLOCKS_PER_SEC,(float) (sum+stop-start)/(count*CLOCKS_PER_SEC),count,sum+stop-start);
sum=(sum+stop-start);*/
v2 = 2.0*UNI-1.0;
r = v1*v1+v2*v2;
//printf("Inside do: r %e\n",r);
} while (r >= 1.0);
//printf("outside do: r %e\n",r);
fac = sqrt(-2.0*log(r)/r);
gset= v1*fac;
iset=1;
return(sqrt(variance)*v2*fac + mean);
} else {
iset=0;
//printf("normal random number %e, max %e, min %e\n",sqrt(variance)*gset + mean, max,min);
if (max<sqrt(variance)*gset + mean)
max=sqrt(variance)*gset + mean;
if (min>sqrt(variance)*gset + mean)
......@@ -422,8 +283,53 @@ double gaussdouble(double mean, double variance)
return(sqrt(variance)*gset + mean);
}
}
#ifdef MAIN
main(int argc,char **argv)
{
int i;
randominit();
for (i=0; i<10; i++) {
printf("%f\n",gaussdouble(0.0,1.0));
}
}
#endif
/*void uniformrandomSSE(__m128d *d1,__m128d *d2)
{
int i,j;
__m128d u_rand128;
__m128i j128;
int j_array[]={(int) (1 + 9.103678167e-08*iy_array[0]),(int) (1 + 9.103678167e-08*iy_array[1]),(int) (1 + 9.103678167e-08*iy_array[2]),(int) (1 + 9.103678167e-08*iy_array[3])};
//j128=_mm_setr_epi32(1 + 391.0*iy_array[0]/mod,1 + 391.0*iy_array[1]/mod,1 + 391.0*iy_array[2]/mod,1 + 391.0*iy_array[3]/mod);
//j128=_mm_setr_epi32(391,391,391,391);
//j128=_mm_mul_epu32(j128,_mm_setr_epi32(iy_array[0],iy_array[1],iy_array[2],iy_array[3]));
//j128=_mm_slli_epi32(j128,32);
//j=1 + 97.0*iy/mod;
iy_array[0]=ir[j_array[0]];
iy_array[1]=ir[j_array[1]];
iy_array[2]=ir[j_array[2]];
iy_array[3]=ir[j_array[3]];
seed_array[0] = a*seed_array[0];
seed_array[1] = a*seed_array[1];
seed_array[2] = a*seed_array[2];
seed_array[3] = a*seed_array[3];
ir[j_array[0]] = seed_array[0];
ir[j_array[1]] = seed_array[1];
ir[j_array[2]] = seed_array[2];
ir[j_array[3]] = seed_array[3];
*d1=_mm_setr_pd (2*((double) iy_array[0]/mod)-1, 2*((double) iy_array[1])-1);
*d2=_mm_setr_pd (2*((double) iy_array[2]/mod)-1, 2*((double) iy_array[3])-1);
//return ((double) iy/mod );
return;
}*/
/*static void gaussfloat_sse2(float* data, size_t count) {
//assert(count % 8 == 0);
//LCG<__m128> r;
......@@ -438,17 +344,90 @@ double gaussdouble(double mean, double variance)
_mm_store_ps(&data[i + 4], _mm_mul_ps(radius, sintheta));
}
}*/
#ifdef MAIN
main(int argc,char **argv)
/*#define randominit_SSE
#ifdef randominit_SSE
void randominit(unsigned seed_init)
{
int i;
// this need to be integrated with the existing rng, like taus: navid
msg("Initializing random number generator, seed %x\n",seed_init);
randominit();
if (seed_init == 0) {
srand((unsigned)time(NULL));
seed_array[0] = (unsigned int) rand();
seed_array[1] = (unsigned int) rand();
seed_array[2] = (unsigned int) rand();
seed_array[3] = (unsigned int) rand();
seed = (unsigned int) rand();
} else {
seed = seed_init;
seed_array[0] = seed_init;
seed_array[1] = log(seed_init);
seed_array[2] = pow(seed_init,3);
seed_array[3] = sqrt(seed_init);
}
if (seed % 2 == 0) seed += 1; // seed and mod are relative prime
for (i=0;i<4;i++)
if (seed_array[i] % 2 == 0) seed_array[i] += 1; // seed and mod are relative prime
for (i=0; i<10; i++) {
printf("%f\n",gaussdouble(0.0,1.0));
for (i=1; i<4*98; i++) { //4 times 98 to use in SSE implementations
seed_array[i%4] = a*seed_array[i%4]; // mod 2**32
ir[i]= seed_array[i%4]; // initialize the shuffle table
}
}
#endif
for ( i = 0; i < 4*98; i++ ){
printf("ir[%d]: %d\n",i,ir[i]);
}
iy=1;
}*/
//#define jsr4 (jz=jsr, jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5),jz+jsr,printf("seed %d, next seed %d\n",jz,jsr))
//#define SHR3_SSE (jsr_128=_mm_loadu_si128(jsr4),jz_128=jsr_128, printf("jsr4 is %lu,%lu,%lu,%lu\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]), jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,13),jsr_128),_mm_storeu_si128((__m128i *)jsr4,jsr_128),printf("jsr128<<13 is %lu,%lu,%lu,%lu\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]), jsr_128=_mm_xor_si128(_mm_srli_epi32(jsr_128,17),jsr_128),_mm_storeu_si128((__m128i *)jsr4,jsr_128),printf("jsr128>>17 is %lu,%lu,%lu,%lu\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]), jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,5),jsr_128),_mm_storeu_si128((__m128i *)jsr4,jsr_128), printf("jsr128<<5 is %lu,%lu,%lu,%lu\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]), _mm_storeu_si128(out,_mm_add_epi32(jz_128,jsr_128)),printf("out is %lu,%lu,%lu,%lu\n",out[0],out[1],out[2],out[3]),_mm_add_epi32(jz_128,jsr_128))
//#define UNI_SSE (_mm_storeu_ps(out,_mm_add_ps(_mm_mul_ps(_mm_set1_ps(0.2328306e-9),_mm_cvtepi32_ps(SHR3_SSE)),_mm_set1_ps(0.5))),printf("out is %e,%e,%e,%e\n",out[0],out[1],out[2],out[3]),_mm_add_ps(_mm_mul_ps(_mm_set1_ps(0.2328306e-9),_mm_cvtepi32_ps(SHR3_SSE)),_mm_set1_ps(0.5)))
/*double gaussdouble(double mean, double variance)
{
static int iset=0;
static float gset;
float fac,rn,r[2],v1[2],v2[2],ones[]={1.0,1.0,1.0,1.0};
static double max=-1000000;
static double min=1000000;
__m128 v1_128, v2_128, r128, ones128,compge128_mask;
ones128 = _mm_load_ps(ones);
if (iset == 0) {
do {
//v2 = 2.0*uniformrandom()-1.0;
v1_128 = _mm_set1_ps(2*UNI-1);
v2_128 = _mm_set1_ps(2*UNI-1);
//r = v1*v1+v2*v2;
r128= _mm_add_ps(_mm_mul_ps(v1_128,v1_128),_mm_mul_ps(v2_128,v2_128));
compge128_mask=_mm_cmpge_ps(r128,ones128);
//_mm_storeu_ps(&r[0],r128);
//printf("Inside do: r[0] %e, r[1] %e\n",r[0],r[1]);
} while (compge128_mask[0] != 4294967295 && compge128_mask[1]!=4294967295 && compge128_mask[2]!=4294967295 && compge128_mask[3]!=4294967295);
//printf("outside do: r[0] %e, r[1] %e\n",r[0],r[1]);
if (r[0]<r[1]){
fac = sqrt(-2.0*log(r[0])/r[0]);
gset= v1[0]*fac;
iset=1;
return(sqrt(variance)*v1[1]*fac + mean);
}
else{
fac = sqrt(-2.0*log(r[1])/r[1]);
gset= v2[0]*fac;
iset=1;
return(sqrt(variance)*v2[1]*fac + mean);
}
} else {
iset=0;
//printf("normal random number %e, max %e, min %e\n",sqrt(variance)*gset + mean, max,min);
if (max<sqrt(variance)*gset + mean)
max=sqrt(variance)*gset + mean;
if (min>sqrt(variance)*gset + mean)
min=sqrt(variance)*gset + mean;
return(sqrt(variance)*gset + mean);
}
}*/
......@@ -17,6 +17,13 @@
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*-------------------------------------------------------------------------------
* Optimization using SIMD instructions
* Frecuency Domain Analysis
* Luis Felipe Ariza Vesga, email:lfarizav@unal.edu.co
* Functions: do_DL_sig_freq(), do_UL_sig_freq(), init_channel_vars_freq(),
* do_UL_sig_freq_prach.
*-------------------------------------------------------------------------------
*/
#include <string.h>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment