Commit c6bdf445 authored by Luis Ariza's avatar Luis Ariza

Ziggurat version SSE finished. Next step use AVX or AVX2 instructions to...

Ziggurat version SSE finished. Next step use AVX or AVX2 instructions to achieve realtime simulations in frequency domain
parent 02f1ec69
......@@ -472,7 +472,7 @@ double gaussdouble(double,double);
void randominit(unsigned int seed_init);
void table_nor(unsigned long seed);
double nfix(void);
__m128 nfix_SSE(void);
__m128 nfix_SSE(__m128i iz);
__m128 log_ps(__m128 x);
__m128 exp_ps(__m128 x);
double uniformrandom(void);
......
......@@ -158,7 +158,7 @@ static __m128 x __attribute__((aligned(16)));
#define SHR3_SSE (jsr_128=_mm_loadu_si128((__m128i *)jsr4),jz_128=jsr_128, jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,13),jsr_128),jsr_128=_mm_xor_si128(_mm_srli_epi32(jsr_128,17),jsr_128),jsr_128=_mm_xor_si128(_mm_slli_epi32(jsr_128,5),jsr_128),_mm_storeu_si128((__m128i *)jsr4,jsr_128),_mm_add_epi32(jz_128,jsr_128))
#define UNI_SSE (_mm_add_ps(_mm_mul_ps(_mm_set1_ps(0.2328306e-9),_mm_cvtepi32_ps(SHR3_SSE)),_mm_set1_ps(0.5)))
#define NOR_SSE (hz_128=SHR3_SSE,_mm_storeu_si128((__m128i *)hz4,hz_128),iz_128=_mm_and_si128(hz_128,_mm_set1_epi32(127)),_mm_storeu_si128((__m128i *)iz4,iz_128),abs_hz_128=_mm_and_si128(hz_128, _mm_set1_epi32(~0x80000000)),cmplt_option0_128 = _mm_cmplt_epi32(abs_hz_128,_mm_setr_epi32(kn[iz4[0]],kn[iz4[1]],kn[iz4[2]],kn[iz4[3]])),count99=(count99>99)?0:count99+4,nfix_first_run=(count99>99)?0:1,(_mm_testc_si128(cmplt_option0_128,_mm_setr_epi32(0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF)))?x=_mm_mul_ps(_mm_cvtepi32_ps(hz_128),_mm_setr_ps(wn[iz4[0]],wn[iz4[1]],wn[iz4[2]],wn[iz4[3]])):nfix_SSE())
#define NOR_SSE (hz_128=SHR3_SSE,_mm_storeu_si128((__m128i *)hz4,hz_128),iz_128=_mm_and_si128(hz_128,_mm_set1_epi32(127)),_mm_storeu_si128((__m128i *)iz4,iz_128),abs_hz_128=_mm_and_si128(hz_128, _mm_set1_epi32(~0x80000000)),cmplt_option0_128 = _mm_cmplt_epi32(abs_hz_128,_mm_setr_epi32(kn[iz4[0]],kn[iz4[1]],kn[iz4[2]],kn[iz4[3]])),count99=(count99>99)?0:count99+4,nfix_first_run=(count99>99)?0:1,(_mm_testc_si128(cmplt_option0_128,_mm_setr_epi32(0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF)))?x=_mm_mul_ps(_mm_cvtepi32_ps(hz_128),_mm_setr_ps(wn[iz4[0]],wn[iz4[1]],wn[iz4[2]],wn[iz4[3]])):nfix_SSE(iz_128))
//#define NOR1_SSE (hz1_128=SHR3_SSE,_mm_storeu_si128((__m128i *)hz1,hz1_128),iz1_128=_mm_and_si128(hz1_128,_mm_set1_epi32(127)),_mm_storeu_si128((__m128i *)iz1,iz1_128),abs_hz1_128=_mm_and_si128(hz1_128, _mm_set1_epi32(~0x80000000)),_mm_storeu_si128((__m128i *)abshz1,abs_hz1_128))
......@@ -171,7 +171,7 @@ static __m128 x __attribute__((aligned(16)));
//,_mm_storeu_si128(ssh3_sse4,hz_128),printf("ssh3_sse4 %lu,%lu,%lu,%lu\n",ssh3_sse4[0],ssh3_sse4[1],ssh3_sse4[2],ssh3_sse4[3])
//#define NOR (hz=SHR3, printf("hz %d\n",hz),sign=(hz&128)>>7,printf("sign %s\n",(sign)?"-":"+"),iz=hz&127,printf("iz %d\n",iz), (abs(hz)<kn[iz])? (sign)?(-1)*hz*wn[iz]:hz*wn[iz] : (sign)?(-1)*nfix():nfix())
__m128 nfix_SSE(void)
__m128 nfix_SSE(__m128i iz)
{
__m128 y __attribute__((aligned(16)));
__m128i cmplt_option1_128 __attribute__((aligned(16)));
......@@ -185,9 +185,9 @@ __m128 nfix_SSE(void)
int i;
static float r = 3.442620;
uint32_t iz4_i[4] __attribute__((aligned(16))) ;
_mm_storeu_si128((__m128i *)iz4_i,iz_128);
//x=hz * wn[iz];
_mm_storeu_si128((__m128i *)iz4_i,iz_128);
_mm_storeu_si128((__m128i *)cmplt_option0,cmplt_option0_128);
_mm_storeu_ps(x4_option0,x);
count0=0;
......@@ -222,9 +222,9 @@ __m128 nfix_SSE(void)
}
}
while (cmplt_option1[0]!=0x80000000 && cmplt_option1[1]!=0x80000000 && cmplt_option1[2]!=0x80000000 && cmplt_option1[3]!=0x80000000);
return _mm_setr_ps(output[0],output[1],output[2],output[3]);
//return _mm_setr_ps(output[0],output[1],output[2],output[3]);
}
else if (iz4[0]>0&&iz4[1]>0&&iz4[2]>0&&iz4[3]>0&&nfix_first_run==0&&count0>0)
else if (iz4_i[0]>0&&iz4_i[1]>0&&iz4_i[2]>0&&iz4_i[3]>0&&nfix_first_run==0&&count0>0)
{
nfix_first_run=1;
cmplt_option2_128 = _mm_cvtps_epi32(_mm_cmplt_ps(_mm_add_ps(_mm_setr_ps(fn[iz4_i[0]],fn[iz4_i[1]],fn[iz4_i[2]],fn[iz4_i[3]]),_mm_mul_ps(UNI_SSE,_mm_sub_ps(_mm_setr_ps(fn[iz4_i[0]-1],fn[iz4_i[1]-1],fn[iz4_i[2]-1],fn[iz4_i[3]-1]),_mm_setr_ps(fn[iz4_i[0]],fn[iz4_i[1]],fn[iz4_i[2]],fn[iz4_i[3]])))),exp_ps(_mm_mul_ps(_mm_mul_ps(x,x),_mm_set1_ps(-0.5f)))));
......@@ -237,8 +237,31 @@ __m128 nfix_SSE(void)
break;
}
}
return _mm_setr_ps(output[0],output[1],output[2],output[3]);
}
//return _mm_setr_ps(output[0],output[1],output[2],output[3]);
}
if (count0==3)
{
return _mm_setr_ps(output[0],output[1],output[2],output[3]);
}
else
{
hz_128=SHR3_SSE;
_mm_storeu_si128((__m128i *)hz4,hz_128);
iz_128=_mm_and_si128(hz_128,_mm_set1_epi32(127));
_mm_storeu_si128((__m128i *)iz4,iz_128);
abs_hz_128=_mm_and_si128(hz_128, _mm_set1_epi32(~0x80000000));
_mm_storeu_si128((__m128i *)iz4_i,iz_128);
_mm_storeu_si128((__m128i *)cmplt_option0,_mm_cmplt_epi32(abs_hz_128,_mm_setr_epi32(kn[iz4_i[0]],kn[iz4_i[1]],kn[iz4_i[2]],kn[iz4_i[3]])));
for (i=count0;i<3;i++)
{
if (cmplt_option0[i]==0xFFFFFFFF)
{
output[count0]=hz4[i]*wn[iz4_i[i]];
count0++;
}
}
return _mm_setr_ps(output[0],output[1],output[2],output[3]);
}
}
/*!\Procedure to create tables for normal distribution kn,wn and fn. */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment