Commit 0d3505cd authored by Luis Ariza's avatar Luis Ariza

New SHR3_SSE and UNI_SSE functions. Optimization of sin and cos numbers in the channel model

parent 09c1f062
......@@ -92,7 +92,9 @@ void adc_SSE_float(float *r_re[2],
unsigned int **output,
unsigned int nb_rx_antennas,
unsigned int length,
unsigned char B)
unsigned char B,
unsigned int samples,
unsigned int ofdm_symbol_size)
{
int i;
int aa;
......@@ -100,19 +102,21 @@ void adc_SSE_float(float *r_re[2],
__m128i r_re128i, r_im128i,output128;
float gain = (float)(1<<(B-1));
gain128=_mm_set1_ps(gain);
for (i=0; i<(length>>2); i++) {
for (aa=0; aa<nb_rx_antennas; aa++) {
r_re128=_mm_loadu_ps(&r_re[aa][4*i+input_offset]);
r_im128=_mm_loadu_ps(&r_im[aa][4*i+input_offset]);
r_re128=_mm_mul_ps(r_re128,gain128);
r_im128=_mm_mul_ps(r_im128,gain128);
r_re128i=_mm_cvtps_epi32(r_re128);
r_im128i=_mm_cvtps_epi32(r_im128);
r_re128i=_mm_packs_epi32(r_re128i,r_re128i);
r_im128i=_mm_packs_epi32(r_im128i,r_im128i);
output128=_mm_unpacklo_epi16(r_re128i,r_im128i);
_mm_storeu_si128((__m128i *)&output[aa][4*i+output_offset],output128);
}
for (i=0; i<(length>>2); i++)
{
for (aa=0; aa<nb_rx_antennas; aa++)
{
r_re128=_mm_loadu_ps(&r_re[aa][4*i+input_offset]);
r_im128=_mm_loadu_ps(&r_im[aa][4*i+input_offset]);
r_re128=_mm_mul_ps(r_re128,gain128);
r_im128=_mm_mul_ps(r_im128,gain128);
r_re128i=_mm_cvtps_epi32(r_re128);
r_im128i=_mm_cvtps_epi32(r_im128);
r_re128i=_mm_packs_epi32(r_re128i,r_re128i);
r_im128i=_mm_packs_epi32(r_im128i,r_im128i);
output128=_mm_unpacklo_epi16(r_re128i,r_im128i);
_mm_storeu_si128((__m128i *)&output[aa][4*i+output_offset],output128);
}
}
}
......
......@@ -98,7 +98,9 @@ void adc_SSE_float(float *r_re[2],
int **output,
unsigned int nb_rx_antennas,
unsigned int length,
unsigned char B);
unsigned char B,
unsigned int samples,
unsigned int ofdm_symbol_size);
void adc_freq(double *r_re[2],
double *r_im[2],
unsigned int input_offset,
......
......@@ -391,7 +391,7 @@ void rf_rx_simple_freq_SSE_float(float *r_re[2],
float rx_gain_lin = pow(10.0,.05*rx_gain_dB);
//double rx_gain_lin = 1.0;
float N0W = pow(10.0,.1*(-174.0 - 10*log10(s_time*1e-9)));
float sqrt_NOW = sqrt(.5*N0W);
float sqrt_NOW = rx_gain_lin*sqrt(.5*N0W);
//double N0W = 0.0;
// printf("s_time=%f, N0W=%g\n",s_time,10*log10(N0W));
......@@ -404,9 +404,11 @@ void rf_rx_simple_freq_SSE_float(float *r_re[2],
//rx128_gain_lin=mm_loadu_pd(rx_gain_lin);
/*count++;
clock_t start=clock();*/
rx128_gain_lin = _mm_set1_ps(rx_gain_lin);
for (i=0; i<(length>>2); i++) {
for (a=0; a<nb_rx_antennas; a++) {
if (i%(ofdm_symbol_size>>2)>(n_samples>>2) && i%(ofdm_symbol_size>>2)<(ofdm_symbol_size>>2)-(n_samples>>2))
/*if (i%(ofdm_symbol_size>>2)>(n_samples>>2) && i%(ofdm_symbol_size>>2)<(ofdm_symbol_size>>2)-(n_samples>>2))
{
//printf("i = %d\n",i);
//_mm_storeu_pd(&r_re[a][2*i],_mm_setzero_pd());
......@@ -414,11 +416,10 @@ clock_t start=clock();*/
break;
}
else
{
{*/
//rx128_gain_lin=mm_mul_set1_ps(rx_gain_lin);
rx128_re = _mm_loadu_ps(&r_re[a][4*i]);//r_re[a][i],r_re[a][i+1]
rx128_im = _mm_loadu_ps(&r_im[a][4*i]);//r_im[a][i],r_im[a][i+1]
rx128_gain_lin = _mm_set1_ps(rx_gain_lin);
//start_meas(&desc->ziggurat);
gauss_0_128_sqrt_NOW = _mm_set_ps(ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0));
gauss_1_128_sqrt_NOW = _mm_set_ps(ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0),ziggurat(0.0,1.0));
......@@ -428,13 +429,11 @@ clock_t start=clock();*/
// Amplify by receiver gain and apply 3rd order non-linearity
//r_re[a][i] = rx_gain_lin*(r_re[a][i] + sqrt(.5*N0W)*gaussdouble(0.0,1.0));
//r_im[a][i] = rx_gain_lin*(r_im[a][i] + sqrt(.5*N0W)*gaussdouble(0.0,1.0));
rx128_re = _mm_add_ps(rx128_re,gauss_0_128_sqrt_NOW);
rx128_im = _mm_add_ps(rx128_im,gauss_1_128_sqrt_NOW);
rx128_re = _mm_mul_ps(rx128_re,rx128_gain_lin);
rx128_im = _mm_mul_ps(rx128_im,rx128_gain_lin);
rx128_re = _mm_add_ps(_mm_mul_ps(rx128_re,rx128_gain_lin),gauss_0_128_sqrt_NOW);
rx128_im = _mm_add_ps(_mm_mul_ps(rx128_im,rx128_gain_lin),gauss_1_128_sqrt_NOW);
_mm_storeu_ps(&r_re[a][4*i],rx128_re);
_mm_storeu_ps(&r_im[a][4*i],rx128_im);
}
//}
}
}
/*clock_t stop=clock();
......
......@@ -28,6 +28,7 @@
#include "PHY/TOOLS/defs.h"
#include "defs.h"
#include "PHY/sse_intrin.h"
// NEW code with lookup table for sin/cos based on delay profile (TO BE TESTED)
......@@ -181,7 +182,6 @@ int init_freq_channel_SSE_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_sa
else
delay = desc->delays[l]+NB_SAMPLES_CHANNEL_OFFSET/desc->sampling_rate;
//sincos_ps(_mm_set_ps(cos(twopi*(4*f)*delay),cos(twopi*(4*f)*delay)), &sin_lut128, &cos_lut128);
//sincos_ps(_mm_set_ps(twopi*(4*f+3)*delay,twopi*(4*f+2)*delay,twopi*(4*f+1)*delay,twopi*(4*f)*delay), &sin_lut128, &cos_lut128);
cos_lut128=_mm_set_ps(cos(twopi*(4*f+3)*delay),cos(twopi*(4*f+2)*delay),cos(twopi*(4*f+1)*delay),cos(twopi*(4*f)*delay));
sin_lut128=_mm_set_ps(sin(twopi*(4*f+3)*delay),sin(twopi*(4*f+2)*delay),sin(twopi*(4*f+1)*delay),sin(twopi*(4*f)*delay));
......@@ -224,11 +224,11 @@ int init_freq_channel_SSE_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_sa
}
}
for (f=-(n_samples>>1); f<=(n_samples>>1); f++) {
/*for (f=-(n_samples>>1); f<=(n_samples>>1); f++) {
for (l=0; l<(int)desc->nb_taps; l++) {
printf("f %d, l %d (cos,sin) (%e,%e):\n",f,l,cos_lut_f[l][f+(n_samples>>1)],sin_lut_f[l][f+(n_samples>>1)]);
}
}
}*/
return(0);
}
int freq_channel_SSE_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples)
......@@ -863,3 +863,98 @@ void sincos_ps(__m128 x, __m128 *s, __m128 *c) {
*c = _mm_xor_ps(xmm2, sign_bit_cos);
}
__m128 log_ps(__m128 x) {
#if defined(__x86_64__) || defined(__i386__)
__m128i emm0 __attribute__((aligned(16)));
__m128i* invalid_mask1 __attribute__((aligned(16)))=NULL;
__m128 invalid_mask __attribute__((aligned(16)));
__m128 one __attribute__((aligned(16)));
__m128i *y1 __attribute__((aligned(16)))=NULL;
__m128i l __attribute__((aligned(16)))=_mm_setr_epi32(1,2,3,49);
y1 = &l;
//l = _mm_storeu_si128(y1);
static uint32_t s[4] __attribute__((aligned(16)));
_mm_storeu_si128(s,_mm_setr_epi32(1,2,3,45));
printf("s is %d,%d,%d,%d\n",s[0],s[1],s[2],s[3]);
#endif
one = _mm_set1_ps(1.f);
printf("one_m128 is %e,%e,%e,%e\n",one[0],one[1],one[2],one[3]);
invalid_mask= _mm_cmple_ps(x, _mm_setzero_ps());
printf("ln inside bef x is %e,%e,%e,%e\n",x[0],x[1],x[2],x[3]);
printf("ln inside invalid_mask is %e,%e,%e,%e\n",invalid_mask[0],invalid_mask[1],invalid_mask[2],invalid_mask[3]);
invalid_mask1 = (__m128i*) &y1;
//invalid_mask =
printf("ln inside invalid_mask1 is %d,%d,%d,%d\n",invalid_mask1[0],invalid_mask1[1],invalid_mask1[2],invalid_mask1[3]);
x = _mm_max_ps(x, _mm_set1_ps(0x00800000)); // cut off denormalized stuff
printf("ln inside af x is %e,%e,%e,%e\n",x[0],x[1],x[2],x[3]);
// part 1: x = frexpf(x, &e);
emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
// keep only the fractional part
x = _mm_and_ps(x, _mm_set1_ps(~0x7f800000));
printf("ln inside x is %e,%e,%e,%e\n",x[0],x[1],x[2],x[3]);
x = _mm_or_ps(x, _mm_set1_ps(0.5f));
printf("ln inside x is %e,%e,%e,%e\n",x[0],x[1],x[2],x[3]);
// now e=mm0:mm1 contain the really base-2 exponent
emm0 = _mm_sub_epi32(emm0, _mm_set1_epi32(0x7f));
__m128 e = _mm_cvtepi32_ps(emm0);
e = _mm_add_ps(e, one);
__m128 mask = _mm_cmplt_ps(x, _mm_set1_ps(0.707106781186547524f));
__m128 tmp = _mm_and_ps(x, mask);
x = _mm_sub_ps(x, one);
e = _mm_sub_ps(e, _mm_and_ps(one, mask));
x = _mm_add_ps(x, tmp);
__m128 z = _mm_mul_ps(x,x);
__m128 y = _mm_set1_ps(7.0376836292E-2f);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(- 1.1514610310E-1f));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(1.1676998740E-1f));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(- 1.2420140846E-1f));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(1.4249322787E-1f));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(- 1.6668057665E-1f));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(2.0000714765E-1f));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(- 2.4999993993E-1f));
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(3.3333331174E-1f));
y = _mm_mul_ps(y, x);
y = _mm_mul_ps(y, z);
tmp = _mm_mul_ps(e, _mm_set1_ps(-2.12194440e-4f));
y = _mm_add_ps(y, tmp);
tmp = _mm_mul_ps(z, _mm_set1_ps(0.5f));
y = _mm_sub_ps(y, tmp);
tmp = _mm_mul_ps(e, _mm_set1_ps(0.693359375f));
x = _mm_add_ps(x, y);
x = _mm_add_ps(x, tmp);
x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
return x;
}
......@@ -472,6 +472,7 @@ double gaussdouble(double,double);
void randominit(unsigned int seed_init);
void table_nor(unsigned long seed);
double nfix(void);
__m128 log_ps(__m128 x);
double uniformrandom(void);
void uniformrandomSSE(__m128d *d1,__m128d *d2);
double ziggurat(double mean, double variance);
......
......@@ -27,6 +27,7 @@
#endif
#include "defs.h"
#include "PHY/sse_intrin.h"
static unsigned int seed, iy, ir[98];
......@@ -147,12 +148,34 @@ double uniformrandom(void)
/*!\brief Ziggurat random number generator based on rejection sampling. It returns a pseudorandom normally distributed double number with x=0 and variance=1*/
//Procedure to create tables for normal distribution kn,wn and fn
//#define SHR3 (jz=jsr,printf("jsr %lu ",jsr), jsr^=(jsr<<13),printf("jsr<<13 %lu ",jsr),jsr^=(jsr>>17),printf("jsr>>17 %lu ",jsr),jsr^=(jsr<<5),printf("jsr<<5 %lu\n",jsr),printf("SHR3 jsr %lu, jz %lu, jsr+jz %lu\n",jsr,jz,jz+jsr),jz+jsr)
#define SHR3 (jz=jsr, jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5),jz+jsr)
#define UNI (0.5+(signed) SHR3 * 0.2328306e-9)
#define NOR (hz=SHR3,iz=(hz&127),(abs(hz)<kn[iz])? hz*wn[iz] : nfix())
static double wn[128],fn[128];
static unsigned long iz,jz,jsr=123456789,kn[128];
static uint32_t iz,jz,jsr=123456789,kn[128];
static int32_t hz;
static uint32_t jsr4[4] __attribute__((aligned(16))) = {123456789,2714967881,2238813396,1250077441};//This initialization depends on the seed for nor_table function in oaisim_functions.c file.
static uint32_t out[4] __attribute__((aligned(16)));
#if defined(__x86_64__) || defined(__i386__)
static __m128i jsr_128 __attribute__((aligned(16)));
static __m128i jz_128 __attribute__((aligned(16)));
static __m128i hz_128 __attribute__((aligned(16)));
static __m128i iz_128 __attribute__((aligned(16)));
//#define jsr4 (jz=jsr, jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5),jz+jsr,printf("seed %d, next seed %d\n",jz,jsr))
//#define SHR3_SSE (jsr_128=_mm_loadu_si128(jsr4),jz_128=jsr_128, printf("jsr4 is %lu,%lu,%lu,%lu\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]), jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,13),jsr_128),_mm_storeu_si128((__m128i *)jsr4,jsr_128),printf("jsr128<<13 is %lu,%lu,%lu,%lu\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]), jsr_128=_mm_xor_si128(_mm_srli_epi32(jsr_128,17),jsr_128),_mm_storeu_si128((__m128i *)jsr4,jsr_128),printf("jsr128>>17 is %lu,%lu,%lu,%lu\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]), jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,5),jsr_128),_mm_storeu_si128((__m128i *)jsr4,jsr_128), printf("jsr128<<5 is %lu,%lu,%lu,%lu\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]), _mm_storeu_si128(out,_mm_add_epi32(jz_128,jsr_128)),printf("out is %lu,%lu,%lu,%lu\n",out[0],out[1],out[2],out[3]),_mm_add_epi32(jz_128,jsr_128))
#define SHR3_SSE (jsr_128=_mm_loadu_si128(jsr4),jz_128=jsr_128, jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,13),jsr_128),jsr_128=_mm_xor_si128(_mm_srli_epi32(jsr_128,17),jsr_128),jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,5),jsr_128),_mm_storeu_si128((__m128i *)jsr4,jsr_128),_mm_add_epi32(jz_128,jsr_128))
#define UNI_SSE (_mm_add_ps(_mm_mul_ps(_mm_set1_ps(0.2328306e-9),_mm_cvtepi32_ps(SHR3_SSE)),_mm_set1_ps(0.5)))
#define NOR_SSE (hz_128=SHR3_SSE,iz_128=_mm_and_epi32(hz_128,_mm_set1_epi32(127)),_mm_cmplt_epi32(_mm_max_epi32(_mm_sub_epi32(_mm_setzero_si128(),hz_128),hz_128),_mm_setr_epi32(kn[iz_128[0]],kn[iz_128[1]],kn[iz_128[2]],kn[iz_128[3]])),_mm_mul_ps(hz_128,_mm_set_ps(wn[iz_128[0],wn[iz_128[1],wn[iz_128[2],wn[iz_128[3]))): nfix())
#endif
//#define NOR (hz=SHR3, printf("hz %d\n",hz),sign=(hz&128)>>7,printf("sign %s\n",(sign)?"-":"+"),iz=hz&127,printf("iz %d\n",iz), (abs(hz)<kn[iz])? (sign)?(-1)*hz*wn[iz]:hz*wn[iz] : (sign)?(-1)*nfix():nfix())
double nfix(void)
{
......@@ -225,38 +248,107 @@ void table_nor(unsigned long seed)
}
double ziggurat(double mean, double variance)
{
//printf("UNI is %e\n",UNI);
//printf("SHR3 is %d\n",SHR3);
/*__m128i m __attribute__((aligned(16)));
__m128 n __attribute__((aligned(16)));
static uint32_t mm[4] __attribute__((aligned(16)));
static uint32_t aa[4] __attribute__((aligned(16)));
static uint32_t bb[4] __attribute__((aligned(16)));
static float cc[4] __attribute__((aligned(16)));
//printf("SHR3_SSE jsr4 is %d,%d,%d,%d\n\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]);
m=SHR3_SSE;
printf("UNI is %e,%e,%e,%e\n",UNI,UNI,UNI,UNI);
printf("UNI is %e,%e,%e,%e\n",UNI,UNI,UNI,UNI);
printf("UNI is %e,%e,%e,%e\n",UNI,UNI,UNI,UNI);
printf("UNI is %e,%e,%e,%e\n",UNI,UNI,UNI,UNI);
n=UNI_SSE;
printf("UNI_SSE is %e,%e,%e,%e\n\n",n[0],n[1],n[2],n[3]);
_mm_storeu_si128((__m128i *)mm,jsr_128);
_mm_storeu_si128((__m128i *)aa,jz_128);
_mm_storeu_si128((__m128i *)bb,m);
_mm_storeu_ps((__m128 *)cc,n);
printf("SHR3_SSE jsr128 is %d,%d,%d,%d\n",mm[0],mm[1],mm[2],mm[3]);
printf("SHR3_SSE jz128 is %d,%d,%d,%d\n",aa[0],aa[1],aa[2],aa[3]);
printf("SHR3_SSE jsr_128+jz_128 is %d,%d,%d,%d\n",bb[0],bb[1],bb[2],bb[3]);
printf("SHR3_SSE norm is %d,%d,%d,%d\n",cc[0],cc[1],cc[2],cc[3]);
//printf("SHR3_SSE jsr4 is %d,%d,%d,%d\n",jsr4[0],jsr4[1],jsr4[2],jsr4[3]);
//printf("SHR3_SSE jz_128 is %d,%d,%d,%d\n\n",jz_128[0],jz_128[1],jz_128[2],jz_128[3]);
__m128 x ;
x=log_ps(_mm_set_ps(-10,-100,-1000,100000));
printf("ln aprox is %e,%e,%e,%e\n",x[0],x[1],x[2],x[3]);
printf("ln function is %e,%e,%e,%e\n",log(-10),log(-100),log(-1000),log(100000));
uint32_t out[4] __attribute__((aligned(16)));
__m128i m __attribute__((aligned(16)));
m=_mm_max_epi32(_mm_setr_epi32(1,-1,3,-45),_mm_setr_epi32(-1,1,-3,45));
_mm_storeu_si128(out,m);
printf("abs is %d,%d,%d,%d\n",out[0],out[1],out[2],out[3]);*/
return NOR;
}
void boxmuller_SSE_float(float* data, size_t count) {
assert(count % 8 == 0);
__m128 twopi = _mm_set1_ps(2.0f * 3.14159265358979323846f);
__m128 one = _mm_set1_ps(1.0f);
__m128 minustwo = _mm_set1_ps(-2.0f);
__m128 u1_ps,u2_ps;
__m128 r_ps,radius,theta,sintheta,costheta;
__m128i a_ps, b_ps;
__m128i x_ps;
__m128i u_ps;
//LCG<__m128> r;
x_ps=_mm_setr_epi32(10, 100, 1000, 10000);
a_ps=x_ps;
b_ps=_mm_set1_epi32(1664525);
const __m128i tmp1 = _mm_mul_epu32(a_ps, b_ps);
const __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a_ps, 4), _mm_srli_si128(b_ps, 4));
x_ps=_mm_add_epi32(_mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE (0,0,2,0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0))),_mm_set1_epi32(1013904223));
u_ps = _mm_or_si128(_mm_srli_epi32(x_ps, 9), _mm_set1_epi32(0x3F800000));
r_ps = _mm_sub_ps(_mm_castsi128_ps(u_ps), _mm_set1_ps(1));
for (size_t i = 0; i < count; i += 8) {
u1_ps = _mm_sub_ps(one, r_ps); // [0, 1) -> (0, 1]
u2_ps = r_ps;
radius = _mm_sqrt_ps(_mm_mul_ps(minustwo, _mm_set_ps(log(u1_ps[0]),log(u1_ps[1]),log(u1_ps[2]),log(u1_ps[3]))));
theta = _mm_mul_ps(twopi, u2_ps);
sincos_ps(theta, &sintheta, &costheta);
_mm_store_ps(&data[i ], _mm_mul_ps(radius, costheta));
_mm_store_ps(&data[i + 4], _mm_mul_ps(radius, sintheta));
}
}
/*
@defgroup _gaussdouble Gaussian random number generator based on modified Box-Muller transformation.
@ingroup numerical
*/
/*!\brief Gaussian random number generator based on modified Box-Muller transformation.Returns a double-precision floating-point number. */
//#define random_SSE
#define random_SSE
#ifdef random_SSE
double gaussdouble(double mean, double variance)
{
static int iset=0;
static double gset;
double fac,rn,r[2],v1[2],v2[2],ones[]={1.0,1.0};
static float gset;
float fac,rn,r[2],v1[2],v2[2],ones[]={1.0,1.0,1.0,1.0};
static double max=-1000000;
static double min=1000000;
__m128d v1_128, v2_128, r128, ones128, compge128;
ones128 = _mm_load_pd(ones);
__m128 v1_128, v2_128, r128, ones128,compge128_mask;
ones128 = _mm_load_ps(ones);
if (iset == 0) {
do {
//v2 = 2.0*uniformrandom()-1.0;
v1_128 = _mm_set_pd(2*UNI-1,2*UNI-1);
v2_128 = _mm_set_pd(2*UNI-1,2*UNI-1);
v1_128 = _mm_set1_ps(2*UNI-1);
v2_128 = _mm_set1_ps(2*UNI-1);
//r = v1*v1+v2*v2;
v1_128= _mm_mul_pd(v1_128,v1_128);
v2_128= _mm_mul_pd(v2_128,v2_128);
r128= _mm_add_pd(v1_128,v2_128);
//compge128=_mm_cmpge_pd(r128,ones128);
_mm_storeu_pd(&r[0],r128);
r128= _mm_add_ps(_mm_mul_ps(v1_128,v1_128),_mm_mul_ps(v2_128,v2_128));
compge128_mask=_mm_cmpge_ps(r128,ones128);
//_mm_storeu_ps(&r[0],r128);
//printf("Inside do: r[0] %e, r[1] %e\n",r[0],r[1]);
} while (r[0] >= 1.0 && r[1]>=1.0);
} while (compge128_mask[0] != 4294967295 && compge128_mask[1]!=4294967295 && compge128_mask[2]!=4294967295 && compge128_mask[3]!=4294967295);
//printf("outside do: r[0] %e, r[1] %e\n",r[0],r[1]);
if (r[0]<r[1]){
fac = sqrt(-2.0*log(r[0])/r[0]);
......@@ -329,10 +421,10 @@ double gaussdouble(double mean, double variance)
}
}
#endif
/*
static void gaussfloat_sse2(float* data, size_t count) {
/*static void gaussfloat_sse2(float* data, size_t count) {
//assert(count % 8 == 0);
LCG<__m128> r;
//LCG<__m128> r;
for (int i = 0; i < count; i += 8) {
__m128 u1 = _mm_sub_ps(_mm_set1_ps(1.0f), r()); // [0, 1) -> (0, 1]
__m128 u2 = r();
......@@ -343,8 +435,8 @@ static void gaussfloat_sse2(float* data, size_t count) {
_mm_store_ps(&data[i ], _mm_mul_ps(radius, costheta));
_mm_store_ps(&data[i + 4], _mm_mul_ps(radius, sintheta));
}
}
*/
}*/
#ifdef MAIN
main(int argc,char **argv)
{
......
......@@ -682,7 +682,9 @@ void do_DL_sig_freq(channel_desc_t *eNB2UE[NUMBER_OF_eNB_MAX][NUMBER_OF_UE_MAX][
rxdataF,
nb_antennas_rx,
frame_parms->ofdm_symbol_size*frame_parms->symbols_per_tti,
12);
12,
PHY_vars_UE_g[UE_id][CC_id]->frame_parms.N_RB_DL*12,
frame_parms->ofdm_symbol_size);
#else
adc(r_re_p_f,
r_im_p_f,
......@@ -1206,7 +1208,9 @@ void do_UL_sig_freq(channel_desc_t *UE2eNB[NUMBER_OF_UE_MAX][NUMBER_OF_eNB_MAX][
rxdataF,
nb_antennas_rx,
frame_parms->ofdm_symbol_size*frame_parms->symbols_per_tti,
12);
12,
PHY_vars_eNB_g[eNB_id][CC_id]->frame_parms.N_RB_DL*12,
frame_parms->ofdm_symbol_size);
#else
adc(r_re_p_f,
r_im_p_f,
......
......@@ -995,7 +995,7 @@ void init_seed(uint8_t set_seed)
} else {
randominit (0);
table_nor(1234567891);//Setup for the normal probability distribution function. Always use a non-zero unsigned long argument.
table_nor(123456789);//Setup for the normal probability distribution function. Always use a non-zero unsigned long argument.
set_taus_seed (0);
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment