Commit 015187e5 authored by Raymond Knopp's avatar Raymond Knopp

AVX2 optimization activated for gamma computation in 16-bit turbo decoder (single-codeword)

parent 8d4405bd
...@@ -134,7 +134,7 @@ else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l") ...@@ -134,7 +134,7 @@ else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2") set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2")
endif() endif()
if (CPUINFO MATCHES "sse4_2") if (CPUINFO MATCHES "sse4_2")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2 -msse4.2") set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2 -msse4.2 -fno-tree-vectorize")
endif() endif()
if (CPUINFO MATCHES "sse4_1") if (CPUINFO MATCHES "sse4_1")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.1") set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.1")
......
...@@ -223,6 +223,7 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns ...@@ -223,6 +223,7 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
0b00000001}; 0b00000001};
#endif #endif
#ifndef __AVX2__ #ifndef __AVX2__
if ((n&15) > 0) if ((n&15) > 0)
loop++; loop++;
......
...@@ -186,12 +186,16 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16 ...@@ -186,12 +186,16 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16
__m256i new0,new1,new2,new3,new4,new5,new6,new7; __m256i new0,new1,new2,new3,new4,new5,new6,new7;
__m256i alpha_max; __m256i alpha_max;
unsigned long long timein,timeout;
l2 = L>>3; l2 = L>>3;
K1 = (frame_length>>3); K1 = (frame_length>>3);
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Compute alpha (avx2_16bit)\n"); fprintf(fdavx2,"Compute alpha (avx2_16bit)\n");
fprintf(fdavx2b,"Compute alpha (avx2_16bit)\n"); fprintf(fdavx2b,"Compute alpha (avx2_16bit)\n");
#endif #endif
timein = rdtsc_oai();
for (l=K1;; l=l2,rerun_flag=1) { for (l=K1;; l=l2,rerun_flag=1) {
alpha128 = (__m256i *)alpha; alpha128 = (__m256i *)alpha;
...@@ -378,6 +382,9 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16 ...@@ -378,6 +382,9 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16
if (rerun_flag==1) if (rerun_flag==1)
break; break;
} }
timeout = rdtsc_oai();
printf("alpha: inner loop time %llu\n",timeout-timein);
} }
...@@ -386,9 +393,10 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_ ...@@ -386,9 +393,10 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
int k,rerun_flag=0; int k,rerun_flag=0;
__m256i m11_128,m10_128; __m256i *m11p,*m10p;
__m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; register __m256i b0,b1,b2,b3,b4,b5,b6,b7;
__m256i new0,new1,new2,new3,new4,new5,new6,new7; register __m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
register __m256i new0,new1,new2,new3,new4,new5,new6,new7;
__m256i *beta128,*alpha128,*beta_ptr; __m256i *beta128,*alpha128,*beta_ptr;
__m256i beta_max; __m256i beta_max;
...@@ -398,6 +406,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_ ...@@ -398,6 +406,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
llr_t beta0,beta1; llr_t beta0,beta1;
llr_t beta0_cw2,beta1_cw2; llr_t beta0_cw2,beta1_cw2;
unsigned long long timein,timeout;
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
fprintf(fdavx2,"compute_beta (avx2_16bit), %p,%p,%p,%p,framelength %d,F %d\n", fprintf(fdavx2,"compute_beta (avx2_16bit), %p,%p,%p,%p,framelength %d,F %d\n",
beta,m_11,m_10,alpha,frame_length,F); beta,m_11,m_10,alpha,frame_length,F);
...@@ -590,56 +600,74 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_ ...@@ -590,56 +600,74 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
#endif #endif
int loopval=((rerun_flag==0)?0:((frame_length-L)>>3)); int loopval=((rerun_flag==0)?0:((frame_length-L)>>3));
printf("beta: rerun %d => loopval %d\n",rerun_flag,loopval);
timein = rdtsc_oai();
m11p = (frame_length>>3)-1+(__m256i*)m_11;
m10p = (frame_length>>3)-1+(__m256i*)m_10;
for (k=(frame_length>>3)-1; k>=loopval; k--) { for (k=(frame_length>>3)-1; k>=loopval; k--) {
m11_128=((__m256i*)m_11)[k];
m10_128=((__m256i*)m_10)[k];
m_b0 = _mm256_adds_epi16(beta_ptr[4],m11_128); //m11
m_b1 = _mm256_subs_epi16(beta_ptr[4],m11_128); //m00
m_b2 = _mm256_subs_epi16(beta_ptr[5],m10_128); //m01
m_b3 = _mm256_adds_epi16(beta_ptr[5],m10_128); //m10
m_b4 = _mm256_adds_epi16(beta_ptr[6],m10_128); //m10
m_b5 = _mm256_subs_epi16(beta_ptr[6],m10_128); //m01
m_b6 = _mm256_subs_epi16(beta_ptr[7],m11_128); //m00
m_b7 = _mm256_adds_epi16(beta_ptr[7],m11_128); //m11
new0 = _mm256_subs_epi16(beta_ptr[0],m11_128); //m00
new1 = _mm256_adds_epi16(beta_ptr[0],m11_128); //m11
new2 = _mm256_adds_epi16(beta_ptr[1],m10_128); //m10
new3 = _mm256_subs_epi16(beta_ptr[1],m10_128); //m01
new4 = _mm256_subs_epi16(beta_ptr[2],m10_128); //m01
new5 = _mm256_adds_epi16(beta_ptr[2],m10_128); //m10
new6 = _mm256_adds_epi16(beta_ptr[3],m11_128); //m11
new7 = _mm256_subs_epi16(beta_ptr[3],m11_128); //m00
beta_ptr-=8; b4 = _mm256_load_si256(&beta_ptr[4]);
b5 = _mm256_load_si256(&beta_ptr[5]);
b6 = _mm256_load_si256(&beta_ptr[6]);
b7 = _mm256_load_si256(&beta_ptr[7]);
m_b0 = _mm256_adds_epi16(b4,*m11p); //m11
m_b1 = _mm256_subs_epi16(b4,*m11p); //m00
m_b2 = _mm256_subs_epi16(b5,*m10p); //m01
m_b3 = _mm256_adds_epi16(b5,*m10p); //m10
m_b4 = _mm256_adds_epi16(b6,*m10p); //m10
m_b5 = _mm256_subs_epi16(b6,*m10p); //m01
m_b6 = _mm256_subs_epi16(b7,*m11p); //m00
m_b7 = _mm256_adds_epi16(b7,*m11p); //m11
b0 = _mm256_load_si256(&beta_ptr[0]);
b1 = _mm256_load_si256(&beta_ptr[1]);
b2 = _mm256_load_si256(&beta_ptr[2]);
b3 = _mm256_load_si256(&beta_ptr[3]);
new0 = _mm256_subs_epi16(b0,*m11p); //m00
new1 = _mm256_adds_epi16(b0,*m11p); //m11
new2 = _mm256_adds_epi16(b1,*m10p); //m10
new3 = _mm256_subs_epi16(b1,*m10p); //m01
new4 = _mm256_subs_epi16(b2,*m10p); //m01
new5 = _mm256_adds_epi16(b2,*m10p); //m10
new6 = _mm256_adds_epi16(b3,*m11p); //m11
new7 = _mm256_subs_epi16(b3,*m11p); //m00
b0 = _mm256_max_epi16(m_b0,new0);
b1 = _mm256_max_epi16(m_b1,new1);
b2 = _mm256_max_epi16(m_b2,new2);
b3 = _mm256_max_epi16(m_b3,new3);
b4 = _mm256_max_epi16(m_b4,new4);
b5 = _mm256_max_epi16(m_b5,new5);
b6 = _mm256_max_epi16(m_b6,new6);
b7 = _mm256_max_epi16(m_b7,new7);
beta_max = _mm256_max_epi16(b0,b1);
beta_max = _mm256_max_epi16(beta_max ,b2);
beta_max = _mm256_max_epi16(beta_max ,b3);
beta_max = _mm256_max_epi16(beta_max ,b4);
beta_max = _mm256_max_epi16(beta_max ,b5);
beta_max = _mm256_max_epi16(beta_max ,b6);
beta_max = _mm256_max_epi16(beta_max ,b7);
beta_ptr[0] = _mm256_max_epi16(m_b0,new0); beta_ptr-=8;
beta_ptr[1] = _mm256_max_epi16(m_b1,new1); m11p--;
beta_ptr[2] = _mm256_max_epi16(m_b2,new2); m10p--;
beta_ptr[3] = _mm256_max_epi16(m_b3,new3);
beta_ptr[4] = _mm256_max_epi16(m_b4,new4); beta_ptr[0] = _mm256_subs_epi16(b0,beta_max);
beta_ptr[5] = _mm256_max_epi16(m_b5,new5); beta_ptr[1] = _mm256_subs_epi16(b1,beta_max);
beta_ptr[6] = _mm256_max_epi16(m_b6,new6); beta_ptr[2] = _mm256_subs_epi16(b2,beta_max);
beta_ptr[7] = _mm256_max_epi16(m_b7,new7); beta_ptr[3] = _mm256_subs_epi16(b3,beta_max);
beta_ptr[4] = _mm256_subs_epi16(b4,beta_max);
beta_max = _mm256_max_epi16(beta_ptr[0],beta_ptr[1]); beta_ptr[5] = _mm256_subs_epi16(b5,beta_max);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[2]); beta_ptr[6] = _mm256_subs_epi16(b6,beta_max);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[3]); beta_ptr[7] = _mm256_subs_epi16(b7,beta_max);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[4]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[5]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[6]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[7]);
beta_ptr[0] = _mm256_subs_epi16(beta_ptr[0],beta_max);
beta_ptr[1] = _mm256_subs_epi16(beta_ptr[1],beta_max);
beta_ptr[2] = _mm256_subs_epi16(beta_ptr[2],beta_max);
beta_ptr[3] = _mm256_subs_epi16(beta_ptr[3],beta_max);
beta_ptr[4] = _mm256_subs_epi16(beta_ptr[4],beta_max);
beta_ptr[5] = _mm256_subs_epi16(beta_ptr[5],beta_max);
beta_ptr[6] = _mm256_subs_epi16(beta_ptr[6],beta_max);
beta_ptr[7] = _mm256_subs_epi16(beta_ptr[7],beta_max);
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Loop index %d, mb\n",k); fprintf(fdavx2,"Loop index %d, mb\n",k);
...@@ -658,6 +686,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_ ...@@ -658,6 +686,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
#endif #endif
} }
timeout = rdtsc_oai();
printf("beta: inner loop time %llu\n",timeout-timein);
if (rerun_flag==1) if (rerun_flag==1)
break; break;
...@@ -968,7 +998,7 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y, ...@@ -968,7 +998,7 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y,
yp2 = yparity2; yp2 = yparity2;
#if 0
for (i=0; i<n; i+=8) { for (i=0; i<n; i+=8) {
pi2_p = &pi2tab16avx2[iind][i]; pi2_p = &pi2tab16avx2[iind][i];
...@@ -1084,9 +1114,23 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y, ...@@ -1084,9 +1114,23 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y,
yp128_cw2+=3; yp128_cw2+=3;
} }
yp=(llr_t*)yp128; yp=(llr_t*)yp128;
yp_cw2=(llr_t*)yp128_cw2; yp_cw2=(llr_t*)yp128_cw2;
#else
pi2_p = &pi2tab16avx2[iind][0];
for (i=0,j=0; i<n; i++) {
s[*pi2_p] = y[j];
s[*pi2_p+8] = y2[j++];
yp1[*pi2_p] = y[j];
yp1[*pi2_p+8] = y2[j++];
yp2[*pi2_p] = y[j];
yp2[(*pi2_p++)+8] = y2[j++];
}
yp=(llr_t*)&y[j];
yp_cw2=(llr_t*)&y2[j];
#endif
// Termination // Termination
for (i=0; i<3; i++) { for (i=0; i<3; i++) {
......
...@@ -64,18 +64,15 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14] ...@@ -64,18 +64,15 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14]
(((1+(Nid_cell<<1))*(1 + (((frame_parms->Ncp==0)?4:3)*l) + (7*(1+ns))))<<10); //cinit (((1+(Nid_cell<<1))*(1 + (((frame_parms->Ncp==0)?4:3)*l) + (7*(1+ns))))<<10); //cinit
//x2 = frame_parms->Ncp + (Nid_cell<<1) + (1+(Nid_cell<<1))*(1 + (3*l) + (7*(1+ns))); //cinit //x2 = frame_parms->Ncp + (Nid_cell<<1) + (1+(Nid_cell<<1))*(1 + (3*l) + (7*(1+ns))); //cinit
//n = 0 //n = 0
// printf("cinit (ns %d, l %d) => %d\n",ns,l,x2);
x1 = 1+ (1<<31); x1 = 1+ (1<<31);
x2=x2 ^ ((x2 ^ (x2>>1) ^ (x2>>2) ^ (x2>>3))<<31); x2=x2 ^ ((x2 ^ (x2>>1) ^ (x2>>2) ^ (x2>>3))<<31);
// skip first 50 double words (1600 bits) // skip first 50 double words (1600 bits)
//printf("n=0 : x1 %x, x2 %x\n",x1,x2);
for (n=1; n<50; n++) { for (n=1; n<50; n++) {
x1 = (x1>>1) ^ (x1>>4); x1 = (x1>>1) ^ (x1>>4);
x1 = x1 ^ (x1<<31) ^ (x1<<28); x1 = x1 ^ (x1<<31) ^ (x1<<28);
x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4); x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4);
x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28); x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28);
// printf("x1 : %x, x2 : %x\n",x1,x2);
} }
for (n=0; n<14; n++) { for (n=0; n<14; n++) {
...@@ -84,7 +81,6 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14] ...@@ -84,7 +81,6 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14]
x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4); x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4);
x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28); x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28);
lte_gold_table[ns][l][n] = x1^x2; lte_gold_table[ns][l][n] = x1^x2;
// printf("n=%d : c %x\n",n,x1^x2);
} }
} }
......
...@@ -446,7 +446,8 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue, ...@@ -446,7 +446,8 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
printf("\n"); printf("\n");
*/ */
#ifndef __AVX2__ //#ifndef __AVX2__
#if 1
if (err_flag == 0) { if (err_flag == 0) {
start_meas(dlsch_turbo_decoding_stats); start_meas(dlsch_turbo_decoding_stats);
......
...@@ -1898,17 +1898,17 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, ...@@ -1898,17 +1898,17 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms,
for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {
dl_ch0_128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch0_128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; // hr,0
dl_ch1_128 = (__m128i *)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch1_128 = (__m128i *)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; // hr,1
dl_ch_mag0_128 = (__m128i *)&dl_ch_mag0[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag0_128 = (__m128i *)&dl_ch_mag0[aarx][symbol*frame_parms->N_RB_DL*12];
dl_ch_mag0_128b = (__m128i *)&dl_ch_magb0[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag0_128b = (__m128i *)&dl_ch_magb0[aarx][symbol*frame_parms->N_RB_DL*12];
dl_ch_mag1_128 = (__m128i *)&dl_ch_mag1[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag1_128 = (__m128i *)&dl_ch_mag1[aarx][symbol*frame_parms->N_RB_DL*12];
dl_ch_mag1_128b = (__m128i *)&dl_ch_magb1[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag1_128b = (__m128i *)&dl_ch_magb1[aarx][symbol*frame_parms->N_RB_DL*12];
rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; // yr
rxdataF_comp0_128 = (__m128i *)&rxdataF_comp0[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp0_128 = (__m128i *)&rxdataF_comp0[aarx][symbol*frame_parms->N_RB_DL*12]; // yr,0 = yr * conj(hr,0)
rxdataF_comp1_128 = (__m128i *)&rxdataF_comp1[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp1_128 = (__m128i *)&rxdataF_comp1[aarx][symbol*frame_parms->N_RB_DL*12]; // yr,1 = yr * conj(hr,1)
for (rb=0; rb<nb_rb; rb++) { for (rb=0; rb<nb_rb; rb++) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment