Commit 8d4405bd authored by Raymond Knopp's avatar Raymond Knopp

added avx2 optimized turbo decoder for 16-bit LLR. This decoder parallelizes...

added avx2 optimized turbo decoder for 16-bit LLR.  This decoder parallelizes by decoding 2 code segments concurrently. requires updates dlsch_decoding.c to identify when new parallel version can be used. other minor changes related to memory allocations for future avx2 optimizations (32-byte alignment).
parent 27b1707e
......@@ -134,7 +134,7 @@ else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2")
endif()
if (CPUINFO MATCHES "sse4_2")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.2")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2 -msse4.2")
endif()
if (CPUINFO MATCHES "sse4_1")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.1")
......@@ -168,7 +168,7 @@ set(CMAKE_CXX_FLAGS
# these changes are related to hardcoded path to include .h files
add_definitions(-DCMAKER)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O2")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O3")
# Below has been put in comment because does not work with
# SVN authentication.
......@@ -840,6 +840,7 @@ set(PHY_SRC
${OPENAIR1_DIR}/PHY/CODING/crc_byte.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
${OPENAIR1_DIR}/PHY/CODING/lte_rate_matching.c
${OPENAIR1_DIR}/PHY/CODING/rate_matching.c
${OPENAIR1_DIR}/PHY/CODING/viterbi.c
......
/*******************************************************************************
OpenAirInterface
Copyright(c) 1999 - 2014 Eurecom
OpenAirInterface is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OpenAirInterface is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OpenAirInterface.The full GNU General Public License is
included in this distribution in the file called "COPYING". If not,
see <http://www.gnu.org/licenses/>.
Contact Information
OpenAirInterface Admin: openair_admin@eurecom.fr
OpenAirInterface Tech : openair_tech@eurecom.fr
OpenAirInterface Dev : openair4g-devel@lists.eurecom.fr
Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE
*******************************************************************************/
/* file: 3gpplte_turbo_decoder_sse_16bit.c
purpose: Routines for implementing max-logmap decoding of Turbo-coded (DLSCH) transport channels from 36-212, V8.6 2009-03
authors: raymond.knopp@eurecom.fr, Laurent Thomas (Alcatel-Lucent)
date: 21.10.2009
Note: This version of the routine currently requires SSE2,SSSE3 and SSE4.1 equipped computers. It uses 16-bit inputs for
LLRS and uses 16-bit arithmetic for the internal computations!
Changelog: 17.11.2009 FK SSE4.1 not required anymore
Aug. 2012 new parallelization options for higher speed (8-way parallelization)
Jan. 2013 8-bit LLR support with 16-way parallelization
Feb. 2013 New interleaving and hard-decision optimizations (L. Thomas)
May 2013 Extracted 16bit code
*/
///
///
#ifdef __AVX2__
#include "PHY/sse_intrin.h"
#ifndef TEST_DEBUG
#include "PHY/defs.h"
#include "PHY/CODING/defs.h"
#include "PHY/CODING/lte_interleaver_inline.h"
#include "extern_3GPPinterleaver.h"
#else
#include "defs.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#endif
#ifdef MEX
#include "mex.h"
#endif
//#define DEBUG_LOGMAP
#ifdef DEBUG_LOGMAP
#define print_shorts(s,x) fprintf(fdavx2,"%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7]);fprintf(fdavx2b,"%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15])
FILE *fdavx2,*fdavx2b;
#else
#endif
#define print_bytes(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7],(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15],(x)[16],(x)[17],(x)[18],(x)[19],(x)[20],(x)[21],(x)[22],(x)[23],(x)[24],(x)[25],(x)[26],(x)[27],(x)[28],(x)[29],(x)[30],(x)[31])
typedef int16_t llr_t; // internal decoder LLR data is 16-bit fixed
typedef int16_t channel_t;
#define MAX 256
void log_map16avx2(llr_t* systematic,channel_t* y_parity, llr_t* m11, llr_t* m10, llr_t *alpha, llr_t *beta, llr_t* ext,uint16_t frame_length,unsigned char term_flag,unsigned char F,int offset8_flag,time_stats_t *alpha_stats,time_stats_t *beta_stats,time_stats_t *gamma_stats,time_stats_t *ext_stats);
void compute_gamma16avx2(llr_t* m11,llr_t* m10,llr_t* systematic, channel_t* y_parity, uint16_t frame_length,unsigned char term_flag);
void compute_alpha16avx2(llr_t*alpha,llr_t *beta, llr_t* m11,llr_t* m10, uint16_t frame_length,unsigned char F);
void compute_beta16avx2(llr_t*alpha, llr_t* beta,llr_t* m11,llr_t* m10, uint16_t frame_length,unsigned char F,int offset8_flag);
void compute_ext16avx2(llr_t* alpha,llr_t* beta,llr_t* m11,llr_t* m10,llr_t* extrinsic, llr_t* ap, uint16_t frame_length);
void log_map16avx2(llr_t* systematic,
channel_t* y_parity,
llr_t* m11,
llr_t* m10,
llr_t *alpha,
llr_t *beta,
llr_t* ext,
uint16_t frame_length,
unsigned char term_flag,
unsigned char F,
int offset8_flag,
time_stats_t *alpha_stats,
time_stats_t *beta_stats,
time_stats_t *gamma_stats,
time_stats_t *ext_stats)
{
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"log_map (avx2_16bit), frame_length %d\n",frame_length);
fprintf(fdavx2b,"log_map (avx2_16bit), frame_length %d\n",frame_length);
#endif
start_meas(gamma_stats) ;
compute_gamma16avx2(m11,m10,systematic,y_parity,frame_length,term_flag) ;
stop_meas(gamma_stats);
start_meas(alpha_stats) ;
compute_alpha16avx2(alpha,beta,m11,m10,frame_length,F) ;
stop_meas(alpha_stats);
start_meas(beta_stats) ;
compute_beta16avx2(alpha,beta,m11,m10,frame_length,F,offset8_flag) ;
stop_meas(beta_stats);
start_meas(ext_stats) ;
compute_ext16avx2(alpha,beta,m11,m10,ext,systematic,frame_length) ;
stop_meas(ext_stats);
}
void compute_gamma16avx2(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
uint16_t frame_length,unsigned char term_flag)
{
int k,K1;
__m256i *systematic128 = (__m256i *)systematic;
__m256i *y_parity128 = (__m256i *)y_parity;
__m256i *m10_128 = (__m256i *)m10;
__m256i *m11_128 = (__m256i *)m11;
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"compute_gamma (avx2_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
fprintf(fdavx2b,"compute_gamma (avx2_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
#endif
K1=frame_length>>3;
for (k=0; k<K1; k++) {
m11_128[k] = _mm256_srai_epi16(_mm256_adds_epi16(systematic128[k],y_parity128[k]),1);
m10_128[k] = _mm256_srai_epi16(_mm256_subs_epi16(systematic128[k],y_parity128[k]),1);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Loop index k %d\n",k);
fprintf(fdavx2b,"Loop index k %d\n",k);
print_shorts("sys",(int16_t*)&systematic128[k]);
print_shorts("yp",(int16_t*)&y_parity128[k]);
print_shorts("m11",(int16_t*)&m11_128[k]);
print_shorts("m10",(int16_t*)&m10_128[k]);
#endif
}
// Termination
m11_128[k] = _mm256_srai_epi16(_mm256_adds_epi16(systematic128[k+term_flag],y_parity128[k]),1);
m10_128[k] = _mm256_srai_epi16(_mm256_subs_epi16(systematic128[k+term_flag],y_parity128[k]),1);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Loop index k %d (term flag %d)\n",k,term_flag);
fprintf(fdavx2b,"Loop index k %d (term flag %d)\n",k,term_flag);
print_shorts("sys",(int16_t*)&systematic128[k+term_flag]);
print_shorts("yp",(int16_t*)&y_parity128[k]);
print_shorts("m11",(int16_t*)&m11_128[k]);
print_shorts("m10",(int16_t*)&m10_128[k]);
#endif
}
#define L 40
void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16_t frame_length,unsigned char F)
{
int k,l,l2,K1,rerun_flag=0;
__m256i *alpha128=(__m256i *)alpha,*alpha_ptr;
__m256i a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p;
__m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
__m256i new0,new1,new2,new3,new4,new5,new6,new7;
__m256i alpha_max;
l2 = L>>3;
K1 = (frame_length>>3);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Compute alpha (avx2_16bit)\n");
fprintf(fdavx2b,"Compute alpha (avx2_16bit)\n");
#endif
for (l=K1;; l=l2,rerun_flag=1) {
alpha128 = (__m256i *)alpha;
if (rerun_flag == 0) {
alpha128[0] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,0,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,0);
alpha128[1] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[2] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[3] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[4] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[5] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[6] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[7] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Initial alpha\n");
fprintf(fdavx2b,"Initial alpha\n");
print_shorts("a0",(int16_t*)&alpha128[0]);
print_shorts("a1",(int16_t*)&alpha128[1]);
print_shorts("a2",(int16_t*)&alpha128[2]);
print_shorts("a3",(int16_t*)&alpha128[3]);
print_shorts("a4",(int16_t*)&alpha128[4]);
print_shorts("a5",(int16_t*)&alpha128[5]);
print_shorts("a6",(int16_t*)&alpha128[6]);
print_shorts("a7",(int16_t*)&alpha128[7]);
#endif
} else {
//set initial alpha in columns 1-7 from final alpha from last run in columns 0-6
alpha128[0] = _mm256_slli_si256(alpha128[frame_length],2);
alpha128[1] = _mm256_slli_si256(alpha128[1+frame_length],2);
alpha128[2] = _mm256_slli_si256(alpha128[2+frame_length],2);
alpha128[3] = _mm256_slli_si256(alpha128[3+frame_length],2);
alpha128[4] = _mm256_slli_si256(alpha128[4+frame_length],2);
alpha128[5] = _mm256_slli_si256(alpha128[5+frame_length],2);
alpha128[6] = _mm256_slli_si256(alpha128[6+frame_length],2);
alpha128[7] = _mm256_slli_si256(alpha128[7+frame_length],2);
// set initial alpha in column 0 to (0,-MAX/2,...,-MAX/2)
alpha[16] = -MAX/2;
alpha[32] = -MAX/2;
alpha[48] = -MAX/2;
alpha[64] = -MAX/2;
alpha[80] = -MAX/2;
alpha[96] = -MAX/2;
alpha[112] = -MAX/2;
alpha[24] = -MAX/2;
alpha[40] = -MAX/2;
alpha[56] = -MAX/2;
alpha[72] = -MAX/2;
alpha[88] = -MAX/2;
alpha[104] = -MAX/2;
alpha[120] = -MAX/2;
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Second run\n");
fprintf(fdavx2b,"Second run\n");
print_shorts("a0",(int16_t*)&alpha128[0]);
print_shorts("a1",(int16_t*)&alpha128[1]);
print_shorts("a2",(int16_t*)&alpha128[2]);
print_shorts("a3",(int16_t*)&alpha128[3]);
print_shorts("a4",(int16_t*)&alpha128[4]);
print_shorts("a5",(int16_t*)&alpha128[5]);
print_shorts("a6",(int16_t*)&alpha128[6]);
print_shorts("a7",(int16_t*)&alpha128[7]);
#endif
}
alpha_ptr = &alpha128[0];
m11p = (__m256i*)m_11;
m10p = (__m256i*)m_10;
for (k=0;
k<l;
k++) {
a1=_mm256_load_si256(&alpha_ptr[1]);
a3=_mm256_load_si256(&alpha_ptr[3]);
a5=_mm256_load_si256(&alpha_ptr[5]);
a7=_mm256_load_si256(&alpha_ptr[7]);
m_b0 = _mm256_adds_epi16(a1,*m11p); // m11
m_b4 = _mm256_subs_epi16(a1,*m11p); // m00=-m11
m_b1 = _mm256_subs_epi16(a3,*m10p); // m01=-m10
m_b5 = _mm256_adds_epi16(a3,*m10p); // m10
m_b2 = _mm256_adds_epi16(a5,*m10p); // m10
m_b6 = _mm256_subs_epi16(a5,*m10p); // m01=-m10
m_b3 = _mm256_subs_epi16(a7,*m11p); // m00=-m11
m_b7 = _mm256_adds_epi16(a7,*m11p); // m11
a0=_mm256_load_si256(&alpha_ptr[0]);
a2=_mm256_load_si256(&alpha_ptr[2]);
a4=_mm256_load_si256(&alpha_ptr[4]);
a6=_mm256_load_si256(&alpha_ptr[6]);
new0 = _mm256_subs_epi16(a0,*m11p); // m00=-m11
new4 = _mm256_adds_epi16(a0,*m11p); // m11
new1 = _mm256_adds_epi16(a2,*m10p); // m10
new5 = _mm256_subs_epi16(a2,*m10p); // m01=-m10
new2 = _mm256_subs_epi16(a4,*m10p); // m01=-m10
new6 = _mm256_adds_epi16(a4,*m10p); // m10
new3 = _mm256_adds_epi16(a6,*m11p); // m11
new7 = _mm256_subs_epi16(a6,*m11p); // m00=-m11
a0 = _mm256_max_epi16(m_b0,new0);
a1 = _mm256_max_epi16(m_b1,new1);
a2 = _mm256_max_epi16(m_b2,new2);
a3 = _mm256_max_epi16(m_b3,new3);
a4 = _mm256_max_epi16(m_b4,new4);
a5 = _mm256_max_epi16(m_b5,new5);
a6 = _mm256_max_epi16(m_b6,new6);
a7 = _mm256_max_epi16(m_b7,new7);
alpha_max = _mm256_max_epi16(a0,a1);
alpha_max = _mm256_max_epi16(alpha_max,a2);
alpha_max = _mm256_max_epi16(alpha_max,a3);
alpha_max = _mm256_max_epi16(alpha_max,a4);
alpha_max = _mm256_max_epi16(alpha_max,a5);
alpha_max = _mm256_max_epi16(alpha_max,a6);
alpha_max = _mm256_max_epi16(alpha_max,a7);
alpha_ptr+=8;
m11p++;
m10p++;
alpha_ptr[0] = _mm256_subs_epi16(a0,alpha_max);
alpha_ptr[1] = _mm256_subs_epi16(a1,alpha_max);
alpha_ptr[2] = _mm256_subs_epi16(a2,alpha_max);
alpha_ptr[3] = _mm256_subs_epi16(a3,alpha_max);
alpha_ptr[4] = _mm256_subs_epi16(a4,alpha_max);
alpha_ptr[5] = _mm256_subs_epi16(a5,alpha_max);
alpha_ptr[6] = _mm256_subs_epi16(a6,alpha_max);
alpha_ptr[7] = _mm256_subs_epi16(a7,alpha_max);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Loop index %d\n",k);
fprintf(fdavx2b,"Loop index %d\n",k);
print_shorts("mb0",(int16_t*)&m_b0);
print_shorts("mb1",(int16_t*)&m_b1);
print_shorts("mb2",(int16_t*)&m_b2);
print_shorts("mb3",(int16_t*)&m_b3);
print_shorts("mb4",(int16_t*)&m_b4);
print_shorts("mb5",(int16_t*)&m_b5);
print_shorts("mb6",(int16_t*)&m_b6);
print_shorts("mb7",(int16_t*)&m_b7);
fprintf(fdavx2,"Loop index %d, new\n",k);
fprintf(fdavx2b,"Loop index %d, new\n",k);
print_shorts("new0",(int16_t*)&new0);
print_shorts("new1",(int16_t*)&new1);
print_shorts("new2",(int16_t*)&new2);
print_shorts("new3",(int16_t*)&new3);
print_shorts("new4",(int16_t*)&new4);
print_shorts("new5",(int16_t*)&new5);
print_shorts("new6",(int16_t*)&new6);
print_shorts("new7",(int16_t*)&new7);
fprintf(fdavx2,"Loop index %d, after max\n",k);
fprintf(fdavx2b,"Loop index %d, after max\n",k);
print_shorts("a0",(int16_t*)&a0);
print_shorts("a1",(int16_t*)&a1);
print_shorts("a2",(int16_t*)&a2);
print_shorts("a3",(int16_t*)&a3);
print_shorts("a4",(int16_t*)&a4);
print_shorts("a5",(int16_t*)&a5);
print_shorts("a6",(int16_t*)&a6);
print_shorts("a7",(int16_t*)&a7);
fprintf(fdavx2,"Loop index %d\n",k);
fprintf(fdavx2b,"Loop index %d\n",k);
print_shorts("a0",(int16_t*)&alpha_ptr[0]);
print_shorts("a1",(int16_t*)&alpha_ptr[1]);
print_shorts("a2",(int16_t*)&alpha_ptr[2]);
print_shorts("a3",(int16_t*)&alpha_ptr[3]);
print_shorts("a4",(int16_t*)&alpha_ptr[4]);
print_shorts("a5",(int16_t*)&alpha_ptr[5]);
print_shorts("a6",(int16_t*)&alpha_ptr[6]);
print_shorts("a7",(int16_t*)&alpha_ptr[7]);
#endif
}
if (rerun_flag==1)
break;
}
}
void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_t frame_length,unsigned char F,int offset8_flag)
{
int k,rerun_flag=0;
__m256i m11_128,m10_128;
__m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
__m256i new0,new1,new2,new3,new4,new5,new6,new7;
__m256i *beta128,*alpha128,*beta_ptr;
__m256i beta_max;
llr_t m11,m10,beta0_16,beta1_16,beta2_16,beta3_16,beta4_16,beta5_16,beta6_16,beta7_16,beta0_2,beta1_2,beta2_2,beta3_2,beta_m;
llr_t m11_cw2,m10_cw2,beta0_cw2_16,beta1_cw2_16,beta2_cw2_16,beta3_cw2_16,beta4_cw2_16,beta5_cw2_16,beta6_cw2_16,beta7_cw2_16,beta0_2_cw2,beta1_2_cw2,beta2_2_cw2,beta3_2_cw2,beta_m_cw2;
llr_t beta0,beta1;
llr_t beta0_cw2,beta1_cw2;
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"compute_beta (avx2_16bit), %p,%p,%p,%p,framelength %d,F %d\n",
beta,m_11,m_10,alpha,frame_length,F);
fprintf(fdavx2b,"compute_beta (avx2_16bit), %p,%p,%p,%p,framelength %d,F %d\n",
beta,m_11,m_10,alpha,frame_length,F);
#endif
// termination for beta initialization
// fprintf(fdavx2,"beta init: offset8 %d\n",offset8_flag);
m11=(int16_t)m_11[(frame_length<<1)+2];
m10=(int16_t)m_10[(frame_length<<1)+2];
m11_cw2=(int16_t)m_11[(frame_length<<1)+8+2];
m10_cw2=(int16_t)m_10[(frame_length<<1)+8+2];
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"m11,m10 %d,%d\n",m11,m10);
fprintf(fdavx2b,"m11,m10 %d,%d\n",m11_cw2,m10_cw2);
#endif
beta0 = -m11;//M0T_TERM;
beta1 = m11;//M1T_TERM;
beta0_cw2 = -m11_cw2;//M0T_TERM;
beta1_cw2 = m11_cw2;//M1T_TERM;
m11=(int16_t)m_11[(frame_length<<1)+1];
m10=(int16_t)m_10[(frame_length<<1)+1];
m11_cw2=(int16_t)m_11[(frame_length<<1)+1+8];
m10_cw2=(int16_t)m_10[(frame_length<<1)+1+8];
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"m11,m10 %d,%d\n",m11,m10);
fprintf(fdavx2b,"m11,m10 %d,%d\n",m11_cw2,m10_cw2);
#endif
beta0_2 = beta0-m11;//+M0T_TERM;
beta1_2 = beta0+m11;//+M1T_TERM;
beta2_2 = beta1+m10;//M2T_TERM;
beta3_2 = beta1-m10;//+M3T_TERM;
beta0_2_cw2 = beta0_cw2-m11_cw2;//+M0T_TERM;
beta1_2_cw2 = beta0_cw2+m11_cw2;//+M1T_TERM;
beta2_2_cw2 = beta1_cw2+m10_cw2;//M2T_TERM;
beta3_2_cw2 = beta1_cw2-m10_cw2;//+M3T_TERM;
m11=(int16_t)m_11[frame_length<<1];
m10=(int16_t)m_10[frame_length<<1];
m11_cw2=(int16_t)m_11[(frame_length<<1)+8];
m10_cw2=(int16_t)m_10[(frame_length<<1)+8];
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"m11,m10 %d,%d\n",m11,m10);
fprintf(fdavx2b,"m11,m10 %d,%d\n",m11_cw2,m10_cw2);
#endif
beta0_16 = beta0_2-m11;//+M0T_TERM;
beta1_16 = beta0_2+m11;//+M1T_TERM;
beta2_16 = beta1_2+m10;//+M2T_TERM;
beta3_16 = beta1_2-m10;//+M3T_TERM;
beta4_16 = beta2_2-m10;//+M4T_TERM;
beta5_16 = beta2_2+m10;//+M5T_TERM;
beta6_16 = beta3_2+m11;//+M6T_TERM;
beta7_16 = beta3_2-m11;//+M7T_TERM;
beta0_cw2_16 = beta0_2_cw2-m11_cw2;//+M0T_TERM;
beta1_cw2_16 = beta0_2_cw2+m11_cw2;//+M1T_TERM;
beta2_cw2_16 = beta1_2_cw2+m10_cw2;//+M2T_TERM;
beta3_cw2_16 = beta1_2_cw2-m10_cw2;//+M3T_TERM;
beta4_cw2_16 = beta2_2_cw2-m10_cw2;//+M4T_TERM;
beta5_cw2_16 = beta2_2_cw2+m10_cw2;//+M5T_TERM;
beta6_cw2_16 = beta3_2_cw2+m11_cw2;//+M6T_TERM;
beta7_cw2_16 = beta3_2_cw2-m11_cw2;//+M7T_TERM;
beta_m = (beta0_16>beta1_16) ? beta0_16 : beta1_16;
beta_m = (beta_m>beta2_16) ? beta_m : beta2_16;
beta_m = (beta_m>beta3_16) ? beta_m : beta3_16;
beta_m = (beta_m>beta4_16) ? beta_m : beta4_16;
beta_m = (beta_m>beta5_16) ? beta_m : beta5_16;
beta_m = (beta_m>beta6_16) ? beta_m : beta6_16;
beta_m = (beta_m>beta7_16) ? beta_m : beta7_16;
beta_m_cw2 = (beta0_cw2_16>beta1_cw2_16) ? beta0_cw2_16 : beta1_cw2_16;
beta_m_cw2 = (beta_m_cw2>beta2_cw2_16) ? beta_m_cw2 : beta2_cw2_16;
beta_m_cw2 = (beta_m_cw2>beta3_cw2_16) ? beta_m_cw2 : beta3_cw2_16;
beta_m_cw2 = (beta_m_cw2>beta4_cw2_16) ? beta_m_cw2 : beta4_cw2_16;
beta_m_cw2 = (beta_m_cw2>beta5_cw2_16) ? beta_m_cw2 : beta5_cw2_16;
beta_m_cw2 = (beta_m_cw2>beta6_cw2_16) ? beta_m_cw2 : beta6_cw2_16;
beta_m_cw2 = (beta_m_cw2>beta7_cw2_16) ? beta_m_cw2 : beta7_cw2_16;
beta0_16=beta0_16-beta_m;
beta1_16=beta1_16-beta_m;
beta2_16=beta2_16-beta_m;
beta3_16=beta3_16-beta_m;
beta4_16=beta4_16-beta_m;
beta5_16=beta5_16-beta_m;
beta6_16=beta6_16-beta_m;
beta7_16=beta7_16-beta_m;
beta0_cw2_16=beta0_cw2_16-beta_m_cw2;
beta1_cw2_16=beta1_cw2_16-beta_m_cw2;
beta2_cw2_16=beta2_cw2_16-beta_m_cw2;
beta3_cw2_16=beta3_cw2_16-beta_m_cw2;
beta4_cw2_16=beta4_cw2_16-beta_m_cw2;
beta5_cw2_16=beta5_cw2_16-beta_m_cw2;
beta6_cw2_16=beta6_cw2_16-beta_m_cw2;
beta7_cw2_16=beta7_cw2_16-beta_m_cw2;
for (rerun_flag=0;; rerun_flag=1) {
beta_ptr = (__m256i*)&beta[frame_length<<4];
alpha128 = (__m256i*)&alpha[0];
if (rerun_flag == 0) {
beta_ptr[0] = alpha128[(frame_length)];
beta_ptr[1] = alpha128[1+(frame_length)];
beta_ptr[2] = alpha128[2+(frame_length)];
beta_ptr[3] = alpha128[3+(frame_length)];
beta_ptr[4] = alpha128[4+(frame_length)];
beta_ptr[5] = alpha128[5+(frame_length)];
beta_ptr[6] = alpha128[6+(frame_length)];
beta_ptr[7] = alpha128[7+(frame_length)];
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"beta init \n");
fprintf(fdavx2b,"beta init \n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
} else {
beta128 = (__m256i*)&beta[0];
beta_ptr[0] = _mm256_srli_si256(beta128[0],2);
beta_ptr[1] = _mm256_srli_si256(beta128[1],2);
beta_ptr[2] = _mm256_srli_si256(beta128[2],2);
beta_ptr[3] = _mm256_srli_si256(beta128[3],2);
beta_ptr[4] = _mm256_srli_si256(beta128[4],2);
beta_ptr[5] = _mm256_srli_si256(beta128[5],2);
beta_ptr[6] = _mm256_srli_si256(beta128[6],2);
beta_ptr[7] = _mm256_srli_si256(beta128[7],2);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"beta init (second run)\n");
fprintf(fdavx2b,"beta init (second run)\n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
}
beta_ptr[0] = _mm256_insert_epi16(beta_ptr[0],beta0_16,7);
beta_ptr[1] = _mm256_insert_epi16(beta_ptr[1],beta1_16,7);
beta_ptr[2] = _mm256_insert_epi16(beta_ptr[2],beta2_16,7);
beta_ptr[3] = _mm256_insert_epi16(beta_ptr[3],beta3_16,7);
beta_ptr[4] = _mm256_insert_epi16(beta_ptr[4],beta4_16,7);
beta_ptr[5] = _mm256_insert_epi16(beta_ptr[5],beta5_16,7);
beta_ptr[6] = _mm256_insert_epi16(beta_ptr[6],beta6_16,7);
beta_ptr[7] = _mm256_insert_epi16(beta_ptr[7],beta7_16,7);
beta_ptr[0] = _mm256_insert_epi16(beta_ptr[0],beta0_cw2_16,15);
beta_ptr[1] = _mm256_insert_epi16(beta_ptr[1],beta1_cw2_16,15);
beta_ptr[2] = _mm256_insert_epi16(beta_ptr[2],beta2_cw2_16,15);
beta_ptr[3] = _mm256_insert_epi16(beta_ptr[3],beta3_cw2_16,15);
beta_ptr[4] = _mm256_insert_epi16(beta_ptr[4],beta4_cw2_16,15);
beta_ptr[5] = _mm256_insert_epi16(beta_ptr[5],beta5_cw2_16,15);
beta_ptr[6] = _mm256_insert_epi16(beta_ptr[6],beta6_cw2_16,15);
beta_ptr[7] = _mm256_insert_epi16(beta_ptr[7],beta7_cw2_16,15);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"beta init (after insert) \n");
fprintf(fdavx2b,"beta init (after insert) \n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
int loopval=((rerun_flag==0)?0:((frame_length-L)>>3));
for (k=(frame_length>>3)-1; k>=loopval; k--) {
m11_128=((__m256i*)m_11)[k];
m10_128=((__m256i*)m_10)[k];
m_b0 = _mm256_adds_epi16(beta_ptr[4],m11_128); //m11
m_b1 = _mm256_subs_epi16(beta_ptr[4],m11_128); //m00
m_b2 = _mm256_subs_epi16(beta_ptr[5],m10_128); //m01
m_b3 = _mm256_adds_epi16(beta_ptr[5],m10_128); //m10
m_b4 = _mm256_adds_epi16(beta_ptr[6],m10_128); //m10
m_b5 = _mm256_subs_epi16(beta_ptr[6],m10_128); //m01
m_b6 = _mm256_subs_epi16(beta_ptr[7],m11_128); //m00
m_b7 = _mm256_adds_epi16(beta_ptr[7],m11_128); //m11
new0 = _mm256_subs_epi16(beta_ptr[0],m11_128); //m00
new1 = _mm256_adds_epi16(beta_ptr[0],m11_128); //m11
new2 = _mm256_adds_epi16(beta_ptr[1],m10_128); //m10
new3 = _mm256_subs_epi16(beta_ptr[1],m10_128); //m01
new4 = _mm256_subs_epi16(beta_ptr[2],m10_128); //m01
new5 = _mm256_adds_epi16(beta_ptr[2],m10_128); //m10
new6 = _mm256_adds_epi16(beta_ptr[3],m11_128); //m11
new7 = _mm256_subs_epi16(beta_ptr[3],m11_128); //m00
beta_ptr-=8;
beta_ptr[0] = _mm256_max_epi16(m_b0,new0);
beta_ptr[1] = _mm256_max_epi16(m_b1,new1);
beta_ptr[2] = _mm256_max_epi16(m_b2,new2);
beta_ptr[3] = _mm256_max_epi16(m_b3,new3);
beta_ptr[4] = _mm256_max_epi16(m_b4,new4);
beta_ptr[5] = _mm256_max_epi16(m_b5,new5);
beta_ptr[6] = _mm256_max_epi16(m_b6,new6);
beta_ptr[7] = _mm256_max_epi16(m_b7,new7);
beta_max = _mm256_max_epi16(beta_ptr[0],beta_ptr[1]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[2]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[3]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[4]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[5]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[6]);
beta_max = _mm256_max_epi16(beta_max ,beta_ptr[7]);
beta_ptr[0] = _mm256_subs_epi16(beta_ptr[0],beta_max);
beta_ptr[1] = _mm256_subs_epi16(beta_ptr[1],beta_max);
beta_ptr[2] = _mm256_subs_epi16(beta_ptr[2],beta_max);
beta_ptr[3] = _mm256_subs_epi16(beta_ptr[3],beta_max);
beta_ptr[4] = _mm256_subs_epi16(beta_ptr[4],beta_max);
beta_ptr[5] = _mm256_subs_epi16(beta_ptr[5],beta_max);
beta_ptr[6] = _mm256_subs_epi16(beta_ptr[6],beta_max);
beta_ptr[7] = _mm256_subs_epi16(beta_ptr[7],beta_max);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Loop index %d, mb\n",k);
fprintf(fdavx2,"beta init (after max)\n");
fprintf(fdavx2b,"Loop index %d, mb\n",k);
fprintf(fdavx2b,"beta init (after max)\n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
}
if (rerun_flag==1)
break;
}
}
void compute_ext16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, llr_t* systematic,uint16_t frame_length)
{
__m256i *alpha128=(__m256i *)alpha;
__m256i *beta128=(__m256i *)beta;
__m256i *m11_128,*m10_128,*ext_128;
__m256i *alpha_ptr,*beta_ptr;
__m256i m00_1,m00_2,m00_3,m00_4;
__m256i m01_1,m01_2,m01_3,m01_4;
__m256i m10_1,m10_2,m10_3,m10_4;
__m256i m11_1,m11_2,m11_3,m11_4;
int k;
//
// LLR computation, 8 consequtive bits per loop
//
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"compute_ext (avx2_16bit), %p, %p, %p, %p, %p, %p ,framelength %d\n",alpha,beta,m_11,m_10,ext,systematic,frame_length);
fprintf(fdavx2b,"compute_ext (avx2_16bit), %p, %p, %p, %p, %p, %p ,framelength %d\n",alpha,beta,m_11,m_10,ext,systematic,frame_length);
#endif
alpha_ptr = alpha128;
beta_ptr = &beta128[8];
for (k=0; k<(frame_length>>3); k++) {
m11_128 = (__m256i*)&m_11[k<<4];
m10_128 = (__m256i*)&m_10[k<<4];
ext_128 = (__m256i*)&ext[k<<4];
/*
fprintf(fdavx2,"EXT %03d\n",k);
print_shorts("a0:",&alpha_ptr[0]);
print_shorts("a1:",&alpha_ptr[1]);
print_shorts("a2:",&alpha_ptr[2]);
print_shorts("a3:",&alpha_ptr[3]);
print_shorts("a4:",&alpha_ptr[4]);
print_shorts("a5:",&alpha_ptr[5]);
print_shorts("a6:",&alpha_ptr[6]);
print_shorts("a7:",&alpha_ptr[7]);
print_shorts("b0:",&beta_ptr[0]);
print_shorts("b1:",&beta_ptr[1]);
print_shorts("b2:",&beta_ptr[2]);
print_shorts("b3:",&beta_ptr[3]);
print_shorts("b4:",&beta_ptr[4]);
print_shorts("b5:",&beta_ptr[5]);
print_shorts("b6:",&beta_ptr[6]);
print_shorts("b7:",&beta_ptr[7]);
*/
m00_4 = _mm256_adds_epi16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
m11_4 = _mm256_adds_epi16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
m00_3 = _mm256_adds_epi16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
m11_3 = _mm256_adds_epi16(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11;
m00_2 = _mm256_adds_epi16(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00;
m11_2 = _mm256_adds_epi16(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11;
m11_1 = _mm256_adds_epi16(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11;
m00_1 = _mm256_adds_epi16(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00;
m01_4 = _mm256_adds_epi16(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01;
m10_4 = _mm256_adds_epi16(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10;
m01_3 = _mm256_adds_epi16(alpha_ptr[4],beta_ptr[2]); //ALPHA_BETA_3m01;
m10_3 = _mm256_adds_epi16(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10;
m01_2 = _mm256_adds_epi16(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01;
m10_2 = _mm256_adds_epi16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
m10_1 = _mm256_adds_epi16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
m01_1 = _mm256_adds_epi16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
/*
print_shorts("m11_1:",&m11_1);
print_shorts("m11_2:",&m11_2);
print_shorts("m11_3:",&m11_3);
print_shorts("m11_4:",&m11_4);
print_shorts("m00_1:",&m00_1);
print_shorts("m00_2:",&m00_2);
print_shorts("m00_3:",&m00_3);
print_shorts("m00_4:",&m00_4);
print_shorts("m10_1:",&m10_1);
print_shorts("m10_2:",&m10_2);
print_shorts("m10_3:",&m10_3);
print_shorts("m10_4:",&m10_4);
print_shorts("m01_1:",&m01_1);
print_shorts("m01_2:",&m01_2);
print_shorts("m01_3:",&m01_3);
print_shorts("m01_4:",&m01_4);
*/
m01_1 = _mm256_max_epi16(m01_1,m01_2);
m01_1 = _mm256_max_epi16(m01_1,m01_3);
m01_1 = _mm256_max_epi16(m01_1,m01_4);
m00_1 = _mm256_max_epi16(m00_1,m00_2);
m00_1 = _mm256_max_epi16(m00_1,m00_3);
m00_1 = _mm256_max_epi16(m00_1,m00_4);
m10_1 = _mm256_max_epi16(m10_1,m10_2);
m10_1 = _mm256_max_epi16(m10_1,m10_3);
m10_1 = _mm256_max_epi16(m10_1,m10_4);
m11_1 = _mm256_max_epi16(m11_1,m11_2);
m11_1 = _mm256_max_epi16(m11_1,m11_3);
m11_1 = _mm256_max_epi16(m11_1,m11_4);
// print_shorts("m11_1:",&m11_1);
m01_1 = _mm256_subs_epi16(m01_1,*m10_128);
m00_1 = _mm256_subs_epi16(m00_1,*m11_128);
m10_1 = _mm256_adds_epi16(m10_1,*m10_128);
m11_1 = _mm256_adds_epi16(m11_1,*m11_128);
// print_shorts("m10_1:",&m10_1);
// print_shorts("m11_1:",&m11_1);
m01_1 = _mm256_max_epi16(m01_1,m00_1);
m10_1 = _mm256_max_epi16(m10_1,m11_1);
// print_shorts("m01_1:",&m01_1);
// print_shorts("m10_1:",&m10_1);
*ext_128 = _mm256_subs_epi16(m10_1,m01_1);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"ext %p\n",ext_128);
fprintf(fdavx2b,"ext %p\n",ext_128);
print_shorts("ext:",(int16_t*)ext_128);
print_shorts("m11:",(int16_t*)m11_128);
print_shorts("m10:",(int16_t*)m10_128);
print_shorts("m10_1:",(int16_t*)&m10_1);
print_shorts("m01_1:",(int16_t*)&m01_1);
#endif
alpha_ptr+=8;
beta_ptr+=8;
}
}
//int pi2[n],pi3[n+8],pi5[n+8],pi4[n+8],pi6[n+8],
int *pi2tab16avx2[188],*pi5tab16avx2[188],*pi4tab16avx2[188],*pi6tab16avx2[188];
void free_td16avx2(void)
{
int ind;
for (ind=0; ind<188; ind++) {
free(pi2tab16avx2[ind]);
free(pi5tab16avx2[ind]);
free(pi4tab16avx2[ind]);
free(pi6tab16avx2[ind]);
}
}
void init_td16avx2()
{
int ind,i,i2,i3,j,n,pi,pi2_i,pi2_pi;
short * base_interleaver;
for (ind=0; ind<188; ind++) {
n = f1f2mat[ind].nb_bits;
base_interleaver=il_tb+f1f2mat[ind].beg_index;
#ifdef MEX
// This is needed for the Mex implementation to make the memory persistent
pi2tab16[ind] = mxMalloc((n+8)*sizeof(int));
pi5tab16[ind] = mxMalloc((n+8)*sizeof(int));
pi4tab16[ind] = mxMalloc((n+8)*sizeof(int));
pi6tab16[ind] = mxMalloc((n+8)*sizeof(int));
#else
pi2tab16avx2[ind] = malloc((n+8)*sizeof(int));
pi5tab16avx2[ind] = malloc((n+8)*sizeof(int));
pi4tab16avx2[ind] = malloc((n+8)*sizeof(int));
pi6tab16avx2[ind] = malloc((n+8)*sizeof(int));
#endif
// fprintf(fdavx2,"Interleaver index %d\n",ind);
for (i=i2=0; i2<8; i2++) {
j=i2;
for (i3=0; i3<(n>>3); i3++,i++,j+=8) {
// if (j>=n)
// j-=(n-1);
pi2tab16avx2[ind][i] = ((j>>3)<<4) + (j&7); // 16*floor(j/8) + j mod8, which allows the second codeword to be in pi[i] + 8
// fprintf(fdavx2,"pi2[%d] = %d(%d)\n",i, pi2tab16avx2[ind][i],j);
}
}
for (i=0; i<n; i++) {
pi = base_interleaver[i];//(uint32_t)threegpplte_interleaver(f1,f2,n);
pi2_i = ((pi2tab16avx2[ind][i]>>4)<<3)+(pi2tab16avx2[ind][i]&7);
pi2_pi = ((pi2tab16avx2[ind][pi]>>4)<<3)+(pi2tab16avx2[ind][pi]&7);
pi4tab16avx2[ind][pi2_i] = pi2tab16avx2[ind][pi];
pi5tab16avx2[ind][pi2_pi] = pi2tab16avx2[ind][i];
pi6tab16avx2[ind][pi] = pi2tab16avx2[ind][i];
}
}
}
unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y,
int16_t *y2,
uint8_t *decoded_bytes,
uint8_t *decoded_bytes2,
uint16_t n,
uint16_t f1,
uint16_t f2,
uint8_t max_iterations,
uint8_t crc_type,
uint8_t F,
time_stats_t *init_stats,
time_stats_t *alpha_stats,
time_stats_t *beta_stats,
time_stats_t *gamma_stats,
time_stats_t *ext_stats,
time_stats_t *intl1_stats,
time_stats_t *intl2_stats)
{
/* y is a pointer to the input
decoded_bytes is a pointer to the decoded output
n is the size in bits of the coded block, with the tail */
llr_t systematic0[2*(n+16)] __attribute__ ((aligned(32)));
llr_t systematic1[2*(n+16)] __attribute__ ((aligned(32)));
llr_t systematic2[2*(n+16)] __attribute__ ((aligned(32)));
llr_t yparity1[2*(n+16)] __attribute__ ((aligned(32)));
llr_t yparity2[2*(n+16)] __attribute__ ((aligned(32)));
llr_t ext[2*(n+128)] __attribute__((aligned(32)));
llr_t ext2[2*(n+128)] __attribute__((aligned(32)));
llr_t alpha[(n+16)*16] __attribute__ ((aligned(32)));
llr_t beta[(n+16)*16] __attribute__ ((aligned(32)));
llr_t m11[2*(n+16)] __attribute__ ((aligned(32)));
llr_t m10[2*(n+16)] __attribute__ ((aligned(32)));
int *pi2_p,*pi4_p,*pi5_p,*pi6_p;
llr_t *s,*s1,*s2,*yp1,*yp2,*yp,*yp_cw2;
uint32_t i,j,iind;//,pi;
uint8_t iteration_cnt=0;
uint32_t crc,oldcrc,crc_cw2,oldcrc_cw2,crc_len;
uint8_t temp;
uint32_t db;
__m128i *yp128,*yp128_cw2;
__m256i tmp, zeros=_mm256_setzero_si256();
__m128i tmpe,tmpe_cw2;
int offset8_flag=0;
#ifdef DEBUG_LOGMAP
fdavx2 = fopen("dump_avx2.txt","w");
fdavx2b = fopen("dump_avx2b.txt","w");
printf("tc avx2_16 (y,y2) %p,%p\n",y,y2);
#endif
if (crc_type > 3) {
printf("Illegal crc length!\n");
return 255;
}
start_meas(init_stats);
for (iind=0; iind < 188 && f1f2mat[iind].nb_bits != n; iind++);
if ( iind == 188 ) {
printf("Illegal frame length!\n");
return 255;
}
switch (crc_type) {
case CRC24_A:
case CRC24_B:
crc_len=3;
break;
case CRC16:
crc_len=2;
break;
case CRC8:
crc_len=1;
break;
default:
crc_len=3;
}
yp128 = (__m128i*)y;
yp128_cw2 = (__m128i*)y2;
s = systematic0;
s1 = systematic1;
s2 = systematic2;
yp1 = yparity1;
yp2 = yparity2;
for (i=0; i<n; i+=8) {
pi2_p = &pi2tab16avx2[iind][i];
j=pi2_p[0];
tmpe = _mm_load_si128(yp128);
tmpe_cw2 = _mm_load_si128(yp128_cw2);
// fprintf(fdavx2,"yp128 %p\n",yp128);
// print_shorts("tmpe",(int16_t*)&tmpe);
s[j] = _mm_extract_epi16(tmpe,0);
yp1[j] = _mm_extract_epi16(tmpe,1);
yp2[j] = _mm_extract_epi16(tmpe,2);
s[j+8] = _mm_extract_epi16(tmpe_cw2,0);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,1);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,2);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init0: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init0: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[1];
s[j] = _mm_extract_epi16(tmpe,3);
yp1[j] = _mm_extract_epi16(tmpe,4);
yp2[j] = _mm_extract_epi16(tmpe,5);
s[j+8] = _mm_extract_epi16(tmpe_cw2,3);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,4);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,5);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init1: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init1: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[2];
s[j] = _mm_extract_epi16(tmpe,6);
yp1[j] = _mm_extract_epi16(tmpe,7);
tmpe = _mm_load_si128(&yp128[1]);
yp2[j] = _mm_extract_epi16(tmpe,0);
s[j+8] = _mm_extract_epi16(tmpe_cw2,6);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,7);
tmpe_cw2 = _mm_load_si128(&yp128_cw2[1]);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,0);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init2: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init2: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[3];
s[j] = _mm_extract_epi16(tmpe,1);
yp1[j] = _mm_extract_epi16(tmpe,2);
yp2[j] = _mm_extract_epi16(tmpe,3);
s[j+8] = _mm_extract_epi16(tmpe_cw2,1);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,2);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,3);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init3: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init3: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[4];
s[j] = _mm_extract_epi16(tmpe,4);
yp1[j] = _mm_extract_epi16(tmpe,5);
yp2[j] = _mm_extract_epi16(tmpe,6);
s[j+8] = _mm_extract_epi16(tmpe_cw2,4);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,5);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,6);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init4: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init4: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[5];
s[j] = _mm_extract_epi16(tmpe,7);
tmpe = _mm_load_si128(&yp128[2]);
yp1[j] = _mm_extract_epi16(tmpe,0);
yp2[j] = _mm_extract_epi16(tmpe,1);
s[j+8] = _mm_extract_epi16(tmpe_cw2,7);
tmpe_cw2 = _mm_load_si128(&yp128_cw2[2]);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,0);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,1);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init5: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init5: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[6];
s[j] = _mm_extract_epi16(tmpe,2);
yp1[j] = _mm_extract_epi16(tmpe,3);
yp2[j] = _mm_extract_epi16(tmpe,4);
s[j+8] = _mm_extract_epi16(tmpe_cw2,2);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,3);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,4);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init6: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init6: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[7];
s[j] = _mm_extract_epi16(tmpe,5);
yp1[j] = _mm_extract_epi16(tmpe,6);
yp2[j] = _mm_extract_epi16(tmpe,7);
s[j+8] = _mm_extract_epi16(tmpe_cw2,5);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,6);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,7);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init7: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init7: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
yp128+=3;
yp128_cw2+=3;
}
yp=(llr_t*)yp128;
yp_cw2=(llr_t*)yp128_cw2;
// Termination
for (i=0; i<3; i++) {
s[(n<<1)+i] = *yp;
s1[(n<<1)+i] = *yp;
s2[(n<<1)+i] = *yp;
yp++;
yp1[(n<<1)+i] = *yp;
yp++;
s[(n<<1)+i+8] = *yp_cw2;
s1[(n<<1)+i+8] = *yp_cw2;
s2[(n<<1)+i+8] = *yp_cw2;
yp_cw2++;
yp1[(n<<1)+i+8] = *yp_cw2;
yp_cw2++;
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Term 1 (%d): %d %d\n",n+i,s[(n<<1)+i],yp1[(n<<1)+i]);
fprintf(fdavx2b,"Term 1 (%d): %d %d\n",n+i,s[(n<<1)+i+8],yp1[(n<<1)+i+8]);
#endif //DEBUG_LOGMAP
}
for (i=16; i<19; i++) {
s[(n<<1)+i] = *yp;
s1[(n<<1)+i] = *yp;
s2[(n<<1)+i] = *yp;
yp++;
yp2[(n<<1)+(i-16)] = *yp;
yp++;
s[(n<<1)+i+8]= *yp_cw2;
s1[(n<<1)+i+8] = *yp_cw2 ;
s2[(n<<1)+i+8] = *yp_cw2;
yp_cw2++;
yp2[(n<<1)+i-16+8] = *yp_cw2;
yp_cw2++;
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Term 2 (%d): %d %d\n",n+i-3-8,s[(n<<1)+i],yp2[(n<<1)+i-16]);
fprintf(fdavx2b,"Term 2 (%d): %d %d\n",n+i-3-8,s[(n<<1)+i+8],yp2[(n<<1)+i-16+8]);
#endif //DEBUG_LOGMAP
}
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"\n");
fprintf(fdavx2b,"\n");
#endif //DEBUG_LOGMAP
stop_meas(init_stats);
// do log_map from first parity bit
log_map16avx2(systematic0,yparity1,m11,m10,alpha,beta,ext,n,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
while (iteration_cnt++ < max_iterations) {
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"\n*******************ITERATION %d (n %d), ext %p\n\n",iteration_cnt,n,ext);
fprintf(fdavx2b,"\n*******************ITERATION %d (n %d), ext %p\n\n",iteration_cnt,n,ext);
#endif //DEBUG_LOGMAP
start_meas(intl1_stats);
pi4_p=pi4tab16avx2[iind];
for (i=0; i<(n>>3); i++) { // steady-state portion
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],0);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],8);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],1);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],9);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],2);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],10);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],3);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],11);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],4);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],12);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],5);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],13);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],6);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],14);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],7);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],15);
#ifdef DEBUG_LOGMAP
print_shorts("syst2",(int16_t*)&((__m256i *)systematic2)[i]);
#endif
}
stop_meas(intl1_stats);
// do log_map from second parity bit
log_map16avx2(systematic2,yparity2,m11,m10,alpha,beta,ext2,n,1,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
pi5_p=pi5tab16avx2[iind];
for (i=0; i<(n>>3); i++) {
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],0);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],8);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],1);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],9);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],2);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],10);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],3);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],11);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],4);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],12);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],5);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],13);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],6);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],14);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],7);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],15);
((__m256i *)systematic1)[i] = _mm256_adds_epi16(_mm256_subs_epi16(tmp,((__m256i*)ext)[i]),((__m256i *)systematic0)[i]);
#ifdef DEBUG_LOGMAP
print_shorts("syst1",(int16_t*)&((__m256i *)systematic1)[i]);
#endif
}
if (iteration_cnt>1) {
start_meas(intl2_stats);
pi6_p=pi6tab16avx2[iind];
for (i=0; i<(n>>3); i++) {
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],7);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],15);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],6);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],14);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],5);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],13);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],4);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],12);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],3);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],11);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],2);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],10);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],1);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],9);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],0);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],8);
#ifdef DEBUG_LOGMAP
print_shorts("tmp",(int16_t*)&tmp);
#endif
tmp=_mm256_cmpgt_epi8(_mm256_packs_epi16(tmp,zeros),zeros);
db=(uint32_t)_mm256_movemask_epi8(tmp);
decoded_bytes[i]=db&0xff;
decoded_bytes2[i]=(uint8_t)(db>>16)&0xff;
#ifdef DEBUG_LOGMAP
print_shorts("tmp",(int16_t*)&tmp);
fprintf(fdavx2,"decoded_bytes[%d] %x (%x)\n",i,decoded_bytes[i],db);
fprintf(fdavx2b,"decoded_bytes[%d] %x (%x)\n",i,decoded_bytes2[i],db);
#endif
}
}
// check status on output
if (iteration_cnt>1) {
oldcrc= *((uint32_t *)(&decoded_bytes[(n>>3)-crc_len]));
switch (crc_type) {
case CRC24_A:
oldcrc&=0x00ffffff;
crc = crc24a(&decoded_bytes[F>>3],
n-24-F)>>8;
temp=((uint8_t *)&crc)[2];
((uint8_t *)&crc)[2] = ((uint8_t *)&crc)[0];
((uint8_t *)&crc)[0] = temp;
break;
case CRC24_B:
oldcrc&=0x00ffffff;
crc = crc24b(decoded_bytes,
n-24)>>8;
temp=((uint8_t *)&crc)[2];
((uint8_t *)&crc)[2] = ((uint8_t *)&crc)[0];
((uint8_t *)&crc)[0] = temp;
break;
case CRC16:
oldcrc&=0x0000ffff;
crc = crc16(decoded_bytes,
n-16)>>16;
break;
case CRC8:
oldcrc&=0x000000ff;
crc = crc8(decoded_bytes,
n-8)>>24;
break;
default:
printf("FATAL: 3gpplte_turbo_decoder_sse.c: Unknown CRC\n");
return(255);
break;
}
// second CW
oldcrc_cw2= *((uint32_t *)(&decoded_bytes2[(n>>3)-crc_len]));
switch (crc_type) {
case CRC24_A:
oldcrc_cw2&=0x00ffffff;
crc_cw2 = crc24a(&decoded_bytes2[F>>3],
n-24-F)>>8;
temp=((uint8_t *)&crc_cw2)[2];
((uint8_t *)&crc_cw2)[2] = ((uint8_t *)&crc_cw2)[0];
((uint8_t *)&crc_cw2)[0] = temp;
break;
case CRC24_B:
oldcrc_cw2&=0x00ffffff;
crc_cw2 = crc24b(decoded_bytes2,
n-24)>>8;
temp=((uint8_t *)&crc_cw2)[2];
((uint8_t *)&crc_cw2)[2] = ((uint8_t *)&crc_cw2)[0];
((uint8_t *)&crc_cw2)[0] = temp;
break;
case CRC16:
oldcrc_cw2&=0x0000ffff;
crc_cw2 = crc16(decoded_bytes2,
n-16)>>16;
break;
case CRC8:
oldcrc_cw2&=0x000000ff;
crc_cw2 = crc8(decoded_bytes2,
n-8)>>24;
break;
default:
printf("FATAL: 3gpplte_turbo_decoder_sse.c: Unknown CRC\n");
return(255);
break;
}
stop_meas(intl2_stats);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"oldcrc %x, crc %x, oldcrc_cw2 %x, crc_cw2 %x\n",oldcrc,crc,oldcrc_cw2,crc_cw2);
fprintf(fdavx2b,"oldcrc %x, crc %x, oldcrc_cw2 %x, crc_cw2 %x\n",oldcrc,crc,oldcrc_cw2,crc_cw2);
#endif
if ((crc == oldcrc) && (crc!=0) && (crc_cw2 == oldcrc_cw2) && (crc_cw2!=0)) {
return(iteration_cnt);
}
}
// do log_map from first parity bit
if (iteration_cnt < max_iterations) {
log_map16avx2(systematic1,yparity1,m11,m10,alpha,beta,ext,n,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
__m256i* ext_128=(__m256i*) ext;
__m256i* s1_128=(__m256i*) systematic1;
__m256i* s0_128=(__m256i*) systematic0;
int myloop=n>>3;
for (i=0; i<myloop; i++) {
*ext_128=_mm256_adds_epi16(_mm256_subs_epi16(*ext_128,*s1_128++),*s0_128++);
ext_128++;
}
}
}
// fprintf(fdavx2,"crc %x, oldcrc %x\n",crc,oldcrc);
_mm_empty();
_m_empty();
#ifdef DEBUG_LOGMAP
fclose(fdavx2);
#endif
return(iteration_cnt);
}
#endif __AVX2__
......@@ -63,11 +63,17 @@
#include "mex.h"
#endif
//#define DEBUG_LOGMAP
#ifdef DEBUG_LOGMAP
#define print_shorts(s,x) fprintf(fdsse4,"%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7])
#endif
#define print_shorts(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7])
//#define DEBUG_LOGMAP
#ifdef DEBUG_LOGMAP
FILE *fdsse4;
#endif
typedef int16_t llr_t; // internal decoder LLR data is 16-bit fixed
typedef int16_t channel_t;
......@@ -99,7 +105,7 @@ void log_map16(llr_t* systematic,
{
#ifdef DEBUG_LOGMAP
msg("log_map, frame_length %d\n",frame_length);
fprintf(fdsse4,"log_map, frame_length %d\n",frame_length);
#endif
start_meas(gamma_stats) ;
......@@ -135,7 +141,7 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
#endif
#ifdef DEBUG_LOGMAP
msg("compute_gamma, %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
fprintf(fdsse4,"compute_gamma (sse_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
#endif
K1=frame_length>>3;
......@@ -150,7 +156,7 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
#endif
#ifdef DEBUG_LOGMAP
printf("Loop index k, m11,m10\n");
fprintf(fdsse4,"Loop index k %d\n", k);
print_shorts("sys",(int16_t*)&systematic128[k]);
print_shorts("yp",(int16_t*)&y_parity128[k]);
print_shorts("m11",(int16_t*)&m11_128[k]);
......@@ -164,7 +170,15 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k+term_flag],y_parity128[k]),1);
#elif defined(__arm__)
m11_128[k] = vhaddq_s16(systematic128[k+term_flag],y_parity128[k]);
m10_128[k] = vhsubq_s16(systematic128[k+term_flag],y_parity128[k]);
m10_128[k] = vhsubq_s16(systematic128[k+term_flag],y_parity128[k]);
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"Loop index k %d (term flag %d)\n", k,term_flag);
print_shorts("sys",(int16_t*)&systematic128[k]);
print_shorts("yp",(int16_t*)&y_parity128[k]);
print_shorts("m11",(int16_t*)&m11_128[k]);
print_shorts("m10",(int16_t*)&m10_128[k]);
#endif
}
......@@ -188,7 +202,9 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
#endif
l2 = L>>3;
K1 = (frame_length>>3);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"compute_alpha (sse_16bit)\n");
#endif
for (l=K1;; l=l2,rerun_flag=1) {
#if defined(__x86_64__) || defined(__i386__)
alpha128 = (__m128i *)alpha;
......@@ -218,7 +234,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha128[7] = vdupq_n_s16(-MAX/2);
#endif
#ifdef DEBUG_LOGMAP
printf("Initial alpha\n");
fprintf(fdsse4,"Initial alpha\n");
print_shorts("a0",(int16_t*)&alpha128[0]);
print_shorts("a1",(int16_t*)&alpha128[1]);
print_shorts("a2",(int16_t*)&alpha128[2]);
......@@ -258,7 +274,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha[48] = -MAX/2;
alpha[56] = -MAX/2;
#ifdef DEBUG_LOGMAP
printf("Second run\n");
fprintf(fdsse4,"Second run\n");
print_shorts("a0",(int16_t*)&alpha128[0]);
print_shorts("a1",(int16_t*)&alpha128[1]);
print_shorts("a2",(int16_t*)&alpha128[2]);
......@@ -390,7 +406,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
#endif
#ifdef DEBUG_LOGMAP
printf("Loop index %d, mb\n",k);
fprintf(fdsse4,"Loop index %d\n",k);
print_shorts("mb0",(int16_t*)&m_b0);
print_shorts("mb1",(int16_t*)&m_b1);
print_shorts("mb2",(int16_t*)&m_b2);
......@@ -400,7 +416,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
print_shorts("mb6",(int16_t*)&m_b6);
print_shorts("mb7",(int16_t*)&m_b7);
printf("Loop index %d, new\n",k);
fprintf(fdsse4,"Loop index %d, new\n",k);
print_shorts("new0",(int16_t*)&new0);
print_shorts("new1",(int16_t*)&new1);
print_shorts("new2",(int16_t*)&new2);
......@@ -410,7 +426,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
print_shorts("new6",(int16_t*)&new6);
print_shorts("new7",(int16_t*)&new7);
printf("Loop index %d, after max\n",k);
fprintf(fdsse4,"Loop index %d, after max\n",k);
print_shorts("a0",(int16_t*)&a0);
print_shorts("a1",(int16_t*)&a1);
print_shorts("a2",(int16_t*)&a2);
......@@ -420,7 +436,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
print_shorts("a6",(int16_t*)&a6);
print_shorts("a7",(int16_t*)&a7);
printf("Loop index %d\n",k);
fprintf(fdsse4,"Loop index %d\n",k);
print_shorts("a0",(int16_t*)&alpha_ptr[0]);
print_shorts("a1",(int16_t*)&alpha_ptr[1]);
print_shorts("a2",(int16_t*)&alpha_ptr[2]);
......@@ -463,25 +479,29 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
llr_t beta0,beta1;
#ifdef DEBUG_LOGMAP
msg("compute_beta, %p,%p,%p,%p,framelength %d,F %d\n",
beta,m_11,m_10,alpha,frame_length,F);
fprintf(fdsse4,"compute_beta, %p,%p,%p,%p,framelength %d,F %d\n",
beta,m_11,m_10,alpha,frame_length,F);
#endif
// termination for beta initialization
// printf("beta init: offset8 %d\n",offset8_flag);
// fprintf(fdsse4,"beta init: offset8 %d\n",offset8_flag);
m11=(int16_t)m_11[2+frame_length];
m10=(int16_t)m_10[2+frame_length];
// printf("m11,m10 %d,%d\n",m11,m10);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"m11,m10 %d,%d\n",m11,m10);
#endif
beta0 = -m11;//M0T_TERM;
beta1 = m11;//M1T_TERM;
m11=(int16_t)m_11[1+frame_length];
m10=(int16_t)m_10[1+frame_length];
// printf("m11,m10 %d,%d\n",m11,m10);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"m11,m10 %d,%d\n",m11,m10);
#endif
beta0_2 = beta0-m11;//+M0T_TERM;
beta1_2 = beta0+m11;//+M1T_TERM;
......@@ -489,8 +509,9 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta3_2 = beta1-m10;//+M3T_TERM;
m11=(int16_t)m_11[frame_length];
m10=(int16_t)m_10[frame_length];
// printf("m11,m10 %d,%d (%p)\n",m11,m10,m_11+frame_length);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"m11,m10 %d,%d\n",m11,m10);
#endif
beta0_16 = beta0_2-m11;//+M0T_TERM;
beta1_16 = beta0_2+m11;//+M1T_TERM;
beta2_16 = beta1_2+m10;//+M2T_TERM;
......@@ -536,6 +557,17 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[5] = alpha128[5+(frame_length)];
beta_ptr[6] = alpha128[6+(frame_length)];
beta_ptr[7] = alpha128[7+(frame_length)];
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"beta init \n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
} else {
#if defined(__x86_64__) || defined(__i386__)
beta128 = (__m128i*)&beta[0];
......@@ -558,6 +590,17 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[5] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[5],16); beta_ptr[5] = vsetq_lane_s16(beta[43],beta_ptr[5],4);
beta_ptr[6] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[6],16); beta_ptr[6] = vsetq_lane_s16(beta[51],beta_ptr[6],4);
beta_ptr[7] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[7],16); beta_ptr[7] = vsetq_lane_s16(beta[59],beta_ptr[7],4);
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"beta init (second run) \n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
}
......@@ -582,6 +625,17 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[7] = vsetq_lane_s16(beta7_16,beta_ptr[7],7);
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"beta init (after insert) \n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
int loopval=((rerun_flag==0)?0:((frame_length-L)>>3));
for (k=(frame_length>>3)-1; k>=loopval; k--) {
......@@ -684,6 +738,18 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[7] = vqsubq_s16(beta_ptr[7],beta_max);
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"Loop index %d, mb\n",k);
fprintf(fdsse4,"beta init (after max)\n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
}
......@@ -721,7 +787,7 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
//
#ifdef DEBUG_LOGMAP
msg("compute_ext, %p, %p, %p, %p, %p, %p ,framelength %d\n",alpha,beta,m_11,m_10,ext,systematic,frame_length);
fprintf(fdsse4,"compute_ext (sse_16bit), %p, %p, %p, %p, %p, %p ,framelength %d\n",alpha,beta,m_11,m_10,ext,systematic,frame_length);
#endif
alpha_ptr = alpha128;
......@@ -736,7 +802,7 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
ext_128 = (__m128i*)&ext[k<<3];
/*
printf("EXT %03d\n",k);
fprintf(fdsse4,"EXT %03d\n",k);
print_shorts("a0:",&alpha_ptr[0]);
print_shorts("a1:",&alpha_ptr[1]);
print_shorts("a2:",&alpha_ptr[2]);
......@@ -816,15 +882,15 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
// print_shorts("m10_1:",&m10_1);
*ext_128 = _mm_subs_epi16(m10_1,m01_1);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"ext %p\n",ext_128);
print_shorts("ext:",(int16_t*)ext_128);
print_shorts("m11:",(int16_t*)m11_128);
print_shorts("m10:",(int16_t*)m10_128);
print_shorts("m10_1:",(int16_t*)&m10_1);
print_shorts("m01_1:",(int16_t*)&m01_1);
#endif
/*
print_shorts("ext:",ext_128);
print_shorts("m11:",m11_128);
print_shorts("m10:",m10_128);
print_shorts("m10_1:",&m10_1);
print_shorts("m01_1:",&m01_1);
print_shorts("syst:",systematic_128);
*/
#elif defined(__arm__)
m11_128 = (int16x8_t*)&m_11[k<<3];
m10_128 = (int16x8_t*)&m_10[k<<3];
......@@ -927,7 +993,7 @@ void init_td16()
// j-=(n-1);
pi2tab16[ind][i] = j;
// printf("pi2[%d] = %d\n",i,j);
// fprintf(fdsse4,"pi2[%d] = %d\n",i,j);
}
}
......@@ -989,7 +1055,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
#if defined(__x86_64__) || defined(__i386__)
__m128i *yp128;
__m128i tmp, zeros=_mm_setzero_si128();
register __m128i tmpe;
__m128i tmpe;
#elif defined(__arm__)
int16x8_t *yp128;
// int16x8_t tmp128[(n+8)>>3];
......@@ -1000,12 +1066,20 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
#endif
int offset8_flag=0;
#ifdef DEBUG_LOGMAP
fdsse4 = fopen("dump_sse4.txt","w");
printf("tc sse4_16 (y) %p\n",y);
#endif
if (crc_type > 3) {
msg("Illegal crc length!\n");
printf("Illegal crc length!\n");
return 255;
}
start_meas(init_stats);
......@@ -1013,7 +1087,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
for (iind=0; iind < 188 && f1f2mat[iind].nb_bits != n; iind++);
if ( iind == 188 ) {
msg("Illegal frame length!\n");
printf("Illegal frame length!\n");
return 255;
}
......@@ -1059,62 +1133,74 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
#if defined(__x86_64__) || defined(__i386__)
tmpe = _mm_load_si128(yp128);
// fprintf(fdsse4,"yp128 %p\n",yp128);
// print_shorts("tmpe",(int16_t *)&tmpe);
s[j] = _mm_extract_epi16(tmpe,0);
yp1[j] = _mm_extract_epi16(tmpe,1);
yp2[j] = _mm_extract_epi16(tmpe,2);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init0: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[1];
s[j] = _mm_extract_epi16(tmpe,3);
yp1[j] = _mm_extract_epi16(tmpe,4);
yp2[j] = _mm_extract_epi16(tmpe,5);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init1: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[2];
s[j] = _mm_extract_epi16(tmpe,6);
yp1[j] = _mm_extract_epi16(tmpe,7);
tmpe = _mm_load_si128(&yp128[1]);
yp2[j] = _mm_extract_epi16(tmpe,0);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init2: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[3];
s[j] = _mm_extract_epi16(tmpe,1);
yp1[j] = _mm_extract_epi16(tmpe,2);
yp2[j] = _mm_extract_epi16(tmpe,3);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init3: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[4];
s[j] = _mm_extract_epi16(tmpe,4);
yp1[j] = _mm_extract_epi16(tmpe,5);
yp2[j] = _mm_extract_epi16(tmpe,6);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init4: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[5];
s[j] = _mm_extract_epi16(tmpe,7);
tmpe = _mm_load_si128(&yp128[2]);
yp1[j] = _mm_extract_epi16(tmpe,0);
yp2[j] = _mm_extract_epi16(tmpe,1);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init5: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[6];
s[j] = _mm_extract_epi16(tmpe,2);
yp1[j] = _mm_extract_epi16(tmpe,3);
yp2[j] = _mm_extract_epi16(tmpe,4);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init6: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[7];
s[j] = _mm_extract_epi16(tmpe,5);
yp1[j] = _mm_extract_epi16(tmpe,6);
yp2[j] = _mm_extract_epi16(tmpe,7);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init7: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
#elif defined(__arm__)
s[j] = vgetq_lane_s16(yp128[0],0);
......@@ -1172,7 +1258,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
yp1[i] = *yp;
yp++;
#ifdef DEBUG_LOGMAP
msg("Term 1 (%d): %d %d\n",i,s[i],yp1[i]);
fprintf(fdsse4,"Term 1 (%d): %d %d\n",i,s[i],yp1[i]);
#endif //DEBUG_LOGMAP
}
......@@ -1184,12 +1270,12 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
yp2[i-8] = *yp;
yp++;
#ifdef DEBUG_LOGMAP
msg("Term 2 (%d): %d %d\n",i-3,s[i],yp2[i-8]);
fprintf(fdsse4,"Term 2 (%d): %d %d\n",i-3,s[i],yp2[i-8]);
#endif //DEBUG_LOGMAP
}
#ifdef DEBUG_LOGMAP
msg("\n");
fprintf(fdsse4,"\n");
#endif //DEBUG_LOGMAP
stop_meas(init_stats);
......@@ -1201,7 +1287,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
while (iteration_cnt++ < max_iterations) {
#ifdef DEBUG_LOGMAP
printf("\n*******************ITERATION %d (n %d), ext %p\n\n",iteration_cnt,n,ext);
fprintf(fdsse4,"\n*******************ITERATION %d (n %d), ext %p\n\n",iteration_cnt,n,ext);
#endif //DEBUG_LOGMAP
start_meas(intl1_stats);
......@@ -1209,24 +1295,29 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
pi4_p=pi4tab16[iind];
for (i=0; i<(n>>3); i++) { // steady-state portion
#if defined(__x86_64__) || defined(__i386__)
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],0);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],1);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],2);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],3);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],4);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],5);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],6);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],7);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],0);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],1);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],2);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],3);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],4);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],5);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],6);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],7);
#elif defined(__arm__)
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],0);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],1);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],2);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],3);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],4);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],5);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],6);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],7);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],0);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],1);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],2);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],3);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],4);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],5);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],6);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],7);
#endif
#ifdef DEBUG_LOGMAP
print_shorts("syst2",(int16_t*)&((__m128i *)systematic2)[i]);
#endif
}
......@@ -1261,6 +1352,9 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,6);
tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,7);
((int16x8_t *)systematic1)[i] = vqaddq_s16(vqsubq_s16(tmp,((int16x8_t*)ext)[i]),((int16x8_t *)systematic0)[i]);
#endif
#ifdef DEBUG_LOGMAP
print_shorts("syst1",(int16_t*)&((__m128i *)systematic1)[i]);
#endif
}
......@@ -1278,6 +1372,9 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],2);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],1);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],0);
#ifdef DEBUG_LOGMAP
print_shorts("tmp",(int16_t*)&tmp);
#endif
tmp=_mm_cmpgt_epi8(_mm_packs_epi16(tmp,zeros),zeros);
decoded_bytes[i]=(unsigned char)_mm_movemask_epi8(tmp);
#elif defined(__arm__)
......@@ -1297,6 +1394,10 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
uint64x2_t Mask = vpaddlq_u32(vpaddlq_u16(vandq_u16(vcgtq_s16(tmp,zeros), Powers)));
uint64x1_t Mask64 = vget_high_u64(Mask)+vget_low_u64(Mask);
decoded_bytes[i] = (uint8_t)Mask64;
#endif
#ifdef DEBUG_LOGMAP
print_shorts("tmp",(int16_t*)&tmp);
fprintf(fdsse4,"decoded_bytes[%d] %x\n",i,decoded_bytes[i]);
#endif
}
}
......@@ -1344,6 +1445,9 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
}
stop_meas(intl2_stats);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"oldcrc %x, crc %x\n",oldcrc,crc);
#endif
if ((crc == oldcrc) && (crc!=0)) {
return(iteration_cnt);
......@@ -1374,8 +1478,12 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
}
}
}
// fprintf(fdsse4,"crc %x, oldcrc %x\n",crc,oldcrc);
// printf("crc %x, oldcrc %x\n",crc,oldcrc);
#ifdef DEBUG_LOGMAP
fclose(fdsse4);
#endif
#if defined(__x86_64__) || defined(__i386__)
_mm_empty();
......
......@@ -483,6 +483,24 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
time_stats_t *intl1_stats,
time_stats_t *intl2_stats);
uint8_t phy_threegpplte_turbo_decoder16avx2(int16_t *y,
int16_t *y2,
uint8_t *decoded_bytes,
uint8_t *decoded_bytes2,
uint16_t n,
uint16_t interleaver_f1,
uint16_t interleaver_f2,
uint8_t max_iterations,
uint8_t crc_type,
uint8_t F,
time_stats_t *init_stats,
time_stats_t *alpha_stats,
time_stats_t *beta_stats,
time_stats_t *gamma_stats,
time_stats_t *ext_stats,
time_stats_t *intl1_stats,
time_stats_t *intl2_stats);
/*!
\brief This routine performs max-logmap detection for the 3GPP turbo code (with termination). It is optimized for SIMD processing and 8-bit
LLR arithmetic, and requires SSE2,SSSE3 and SSE4.1 (gcc >=4.3 and appropriate CPU)
......
......@@ -895,7 +895,9 @@ void phy_init_lte_top(LTE_DL_FRAME_PARMS *lte_frame_parms)
init_td8();
init_td16();
#ifdef __AVX2__
init_td16avx2();
#endif
lte_sync_time_init(lte_frame_parms);
......
......@@ -26,187 +26,187 @@
Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE
*******************************************************************************/
short filt24_0[24] __attribute__((aligned(16))) ={
short filt24_0[24] __attribute__((aligned(32))) ={
2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_0_dcl[24] __attribute__((aligned(16))) ={
short filt24_0_dcl[24] __attribute__((aligned(32))) ={
2341,4681,7022,9362,11703,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_0_dcr[24] __attribute__((aligned(16))) ={
short filt24_0_dcr[24] __attribute__((aligned(32))) ={
2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_1[24] __attribute__((aligned(16))) ={
short filt24_1[24] __attribute__((aligned(32))) ={
0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_1_dcl[24] __attribute__((aligned(16))) ={
short filt24_1_dcl[24] __attribute__((aligned(32))) ={
0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_1_dcr[24] __attribute__((aligned(16))) ={
short filt24_1_dcr[24] __attribute__((aligned(32))) ={
0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_2[24] __attribute__((aligned(16))) ={
short filt24_2[24] __attribute__((aligned(32))) ={
0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_2_dcl[24] __attribute__((aligned(16))) ={
short filt24_2_dcl[24] __attribute__((aligned(32))) ={
0,0,2341,4681,7022,9362, 11703,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_2_dcr[24] __attribute__((aligned(16))) ={
short filt24_2_dcr[24] __attribute__((aligned(32))) ={
0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,4681,2341,0,0,0,0,0,0,0,0,0,0,0
};
// X X X Y | X X X X | X Y X X
short filt24_3[24] __attribute__((aligned(16))) ={
short filt24_3[24] __attribute__((aligned(32))) ={
0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0
};
short filt24_3_dcl[24] __attribute__((aligned(16))) ={
short filt24_3_dcl[24] __attribute__((aligned(32))) ={
0,0,0,2341,4681,7022,9362,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0
};
// X X X Y | X X DC X X | X Y X X
short filt24_3_dcr[24] __attribute__((aligned(16))) ={
short filt24_3_dcr[24] __attribute__((aligned(32))) ={
0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,7022,4681,2341,0,0,0,0,0,0,0,0,0,0
};
short filt24_4[24] __attribute__((aligned(16))) ={
short filt24_4[24] __attribute__((aligned(32))) ={
0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0
};
short filt24_4_dcl[24] __attribute__((aligned(16))) ={
short filt24_4_dcl[24] __attribute__((aligned(32))) ={
0,0,0,0,2341,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0
};
short filt24_4_dcr[24] __attribute__((aligned(16))) ={
short filt24_4_dcr[24] __attribute__((aligned(32))) ={
0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,7022,4681,2341,0,0,0,0,0,0,0,0,0
};
short filt24_5[24] __attribute__((aligned(16))) ={
short filt24_5[24] __attribute__((aligned(32))) ={
0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0
};
// X X X Y | X X DC X X | X Y X X
short filt24_5_dcl[24] __attribute__((aligned(16))) ={
short filt24_5_dcl[24] __attribute__((aligned(32))) ={
0,0,0,0,0,2341,4681,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0
};
short filt24_5_dcr[24] __attribute__((aligned(16))) ={
short filt24_5_dcr[24] __attribute__((aligned(32))) ={
0,0,0,0,0,2730,5461,8192,10922,13653,16384,11703,9362,7022,4681,2730,0,0,0,0,0,0,0,0
};
short filt24_6[24] __attribute__((aligned(16))) ={
short filt24_6[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0
};
short filt24_6_dcl[24] __attribute__((aligned(16))) ={
short filt24_6_dcl[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0
};
short filt24_6_dcr[24] __attribute__((aligned(16))) ={
short filt24_6_dcr[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0
};
short filt24_7[24] __attribute__((aligned(16))) ={
short filt24_7[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0
};
short filt24_7_dcl[24] __attribute__((aligned(16))) ={
short filt24_7_dcl[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0
};
short filt24_7_dcr[24] __attribute__((aligned(16))) ={
short filt24_7_dcr[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0
};
short filt24_0l[24] __attribute__((aligned(16))) ={
short filt24_0l[24] __attribute__((aligned(32))) ={
30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_1l[24] __attribute__((aligned(16))) ={
short filt24_1l[24] __attribute__((aligned(32))) ={
0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_2l[24] __attribute__((aligned(16))) ={
short filt24_2l[24] __attribute__((aligned(32))) ={
0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_3l[24] __attribute__((aligned(16))) ={
short filt24_3l[24] __attribute__((aligned(32))) ={
//0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0};
0,0,0,0,0,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0
};
short filt24_4l[24] __attribute__((aligned(16))) ={
short filt24_4l[24] __attribute__((aligned(32))) ={
0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0
};
short filt24_5l[24] __attribute__((aligned(16))) ={
short filt24_5l[24] __attribute__((aligned(32))) ={
0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0
};
short filt24_6l[24] __attribute__((aligned(16))) ={
short filt24_6l[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0
};
short filt24_7l[24] __attribute__((aligned(16))) ={
short filt24_7l[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0
};
short filt24_0l2[24] __attribute__((aligned(16))) ={
short filt24_0l2[24] __attribute__((aligned(32))) ={
2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_1l2[24] __attribute__((aligned(16))) ={
short filt24_1l2[24] __attribute__((aligned(32))) ={
0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_2l2[24] __attribute__((aligned(16))) ={
short filt24_2l2[24] __attribute__((aligned(32))) ={
-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_3l2[24] __attribute__((aligned(16))) ={
short filt24_3l2[24] __attribute__((aligned(32))) ={
-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0
};
short filt24_4l2[24] __attribute__((aligned(16))) ={
short filt24_4l2[24] __attribute__((aligned(32))) ={
-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0
};
short filt24_5l2[24] __attribute__((aligned(16))) ={
short filt24_5l2[24] __attribute__((aligned(32))) ={
0,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0
};
short filt24_6l2[24] __attribute__((aligned(16))) ={
short filt24_6l2[24] __attribute__((aligned(32))) ={
-13653,-10922,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0
};
short filt24_7l2[24] __attribute__((aligned(16))) ={
short filt24_7l2[24] __attribute__((aligned(32))) ={
0,-13653,-10922,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0
};
short filt24_0r[24] __attribute__((aligned(16))) ={
short filt24_0r[24] __attribute__((aligned(32))) ={
2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_1r[24] __attribute__((aligned(16))) ={
short filt24_1r[24] __attribute__((aligned(32))) ={
0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_2r[24] __attribute__((aligned(16))) ={
short filt24_2r[24] __attribute__((aligned(32))) ={
0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0
};
short filt24_3r[24] __attribute__((aligned(16))) ={
short filt24_3r[24] __attribute__((aligned(32))) ={
0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0
};
short filt24_4r[24] __attribute__((aligned(16))) ={
short filt24_4r[24] __attribute__((aligned(32))) ={
0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0
};
short filt24_5r[24] __attribute__((aligned(16))) ={
short filt24_5r[24] __attribute__((aligned(32))) ={
0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0
};
short filt24_6r[24] __attribute__((aligned(16))) ={
short filt24_6r[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0
};
short filt24_7r[24] __attribute__((aligned(16))) ={
short filt24_7r[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0
};
short filt24_0r2[24] __attribute__((aligned(16))) ={ /****/
short filt24_0r2[24] __attribute__((aligned(32))) ={ /****/
2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0,0,0
};
short filt24_1r2[24] __attribute__((aligned(16))) ={
short filt24_1r2[24] __attribute__((aligned(32))) ={
0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0,0
};
short filt24_2r2[24] __attribute__((aligned(16))) ={
short filt24_2r2[24] __attribute__((aligned(32))) ={
0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0
};
short filt24_3r2[24] __attribute__((aligned(16))) ={
short filt24_3r2[24] __attribute__((aligned(32))) ={
0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0
};
short filt24_4r2[24] __attribute__((aligned(16))) ={
short filt24_4r2[24] __attribute__((aligned(32))) ={
0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0
};
short filt24_5r2[24] __attribute__((aligned(16))) ={
short filt24_5r2[24] __attribute__((aligned(32))) ={
0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0
};
short filt24_6r2[24] __attribute__((aligned(16))) ={
short filt24_6r2[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0
};
short filt24_7r2[24] __attribute__((aligned(16))) ={
short filt24_7r2[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653
};
......@@ -52,8 +52,8 @@
int* sync_corr_ue0 = NULL;
int* sync_corr_ue1 = NULL;
int* sync_corr_ue2 = NULL;
int sync_tmp[2048*4] __attribute__((aligned(16)));
short syncF_tmp[2048*2] __attribute__((aligned(16)));
int sync_tmp[2048*4] __attribute__((aligned(32)));
short syncF_tmp[2048*2] __attribute__((aligned(32)));
......
......@@ -56,8 +56,8 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq)
{
#if defined(__x86_64__) || defined(__i386__)
UE_SCAN_INFO_t *scan_info = &ue->scan_info[band];
int16_t spectrum[12288] __attribute__((aligned(16)));
int16_t spectrum_p5ms[12288] __attribute__((aligned(16)));
int16_t spectrum[12288] __attribute__((aligned(32)));
int16_t spectrum_p5ms[12288] __attribute__((aligned(32)));
int i,f,band_idx;
__m128i autocorr0[256/4],autocorr1[256/4],autocorr2[256/4];
__m128i autocorr0_t[256/4],autocorr1_t[256/4],autocorr2_t[256/4];
......
......@@ -186,6 +186,27 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
uint8_t crc_type;
#ifdef DEBUG_DLSCH_DECODING
uint16_t i;
#endif
#ifdef __AVX2__
int Kr_last,skipped_last=0;
uint8_t (*tc_2cw)(int16_t *y,
int16_t *y2,
uint8_t *,
uint8_t *,
uint16_t,
uint16_t,
uint16_t,
uint8_t,
uint8_t,
uint8_t,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *);
#endif
uint8_t (*tc)(int16_t *y,
uint8_t *,
......@@ -203,6 +224,9 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
time_stats_t *,
time_stats_t *);
if (!dlsch_llr) {
printf("dlsch_decoding.c: NULL dlsch_llr pointer\n");
return(dlsch->max_turbo_iterations);
......@@ -223,8 +247,12 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
return(dlsch->max_turbo_iterations);
}
if (llr8_flag == 0)
if (llr8_flag == 0) {
#ifdef __AVX2__
tc_2cw = phy_threegpplte_turbo_decoder16avx2;
#endif
tc = phy_threegpplte_turbo_decoder16;
}
else
tc = phy_threegpplte_turbo_decoder8;
......@@ -300,6 +328,10 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
LOG_E(PHY,"Illegal harq_process->C %d > %d\n",harq_process->C,MAX_NUM_DLSCH_SEGMENTS/bw_scaling);
return((1+dlsch->max_turbo_iterations));
}
#ifdef DEBUG_DLSCH_DECODING
printf("Segmentation: C %d, Cminus %d, Kminus %d, Kplus %d\n",harq_process->C,harq_process->Cminus,harq_process->Kminus,harq_process->Kplus);
#endif
for (r=0; r<harq_process->C; r++) {
......@@ -414,15 +446,11 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
printf("\n");
*/
#ifndef __AVX2__
if (err_flag == 0) {
start_meas(dlsch_turbo_decoding_stats);
#ifdef TURBO_S
ret = phy_threegpplte_turbo_decoder_scalar
#else
ret = tc
#endif
(&harq_process->d[r][96],
harq_process->c[r],
Kr,
......@@ -442,7 +470,130 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
stop_meas(dlsch_turbo_decoding_stats);
}
#else
if ((harq_process->C == 1) ||
((r==harq_process->C-1) && (skipped_last==0))) { // last segment with odd number of segments
start_meas(dlsch_turbo_decoding_stats);
ret = tc
(&harq_process->d[r][96],
harq_process->c[r],
Kr,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
stop_meas(dlsch_turbo_decoding_stats);
// printf("single decode, exit\n");
// exit(-1);
}
else {
// we can merge code segments
if ((skipped_last == 0) && (r<harq_process->C-1)) {
skipped_last = 1;
Kr_last = Kr;
}
else {
skipped_last=0;
if (Kr_last == Kr) { // decode 2 code segments with AVX2 version
#ifdef DEBUG_DLSCH_DECODING
printf("single decoding segment %d (%p)\n",r-1,&harq_process->d[r-1][96]);
#endif
start_meas(dlsch_turbo_decoding_stats);
#ifdef DEBUG_DLSCH_DECODING
printf("double decoding segments %d,%d (%p,%p)\n",r-1,r,&harq_process->d[r-1][96],&harq_process->d[r][96]);
#endif
ret = tc_2cw
(&harq_process->d[r-1][96],
&harq_process->d[r][96],
harq_process->c[r-1],
harq_process->c[r],
Kr,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
/*
ret = tc
(&harq_process->d[r-1][96],
harq_process->c[r-1],
Kr_last,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
exit(-1);*/
stop_meas(dlsch_turbo_decoding_stats);
}
else { // Kr_last != Kr
start_meas(dlsch_turbo_decoding_stats);
ret = tc
(&harq_process->d[r-1][96],
harq_process->c[r-1],
Kr_last,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
stop_meas(dlsch_turbo_decoding_stats);
start_meas(dlsch_turbo_decoding_stats);
ret = tc
(&harq_process->d[r][96],
harq_process->c[r],
Kr,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
stop_meas(dlsch_turbo_decoding_stats);
}
}
}
#endif
if ((err_flag == 0) && (ret>=(1+dlsch->max_turbo_iterations))) {// a Code segment is in error so break;
......
......@@ -79,9 +79,9 @@ extern int exit_openair;
//extern void do_OFDM_mod(mod_sym_t **txdataF, int32_t **txdata, uint32_t frame, uint16_t next_slot, LTE_DL_FRAME_PARMS *frame_parms);
unsigned char dlsch_input_buffer[2700] __attribute__ ((aligned(16)));
int eNB_sync_buffer0[640*6] __attribute__ ((aligned(16)));
int eNB_sync_buffer1[640*6] __attribute__ ((aligned(16)));
unsigned char dlsch_input_buffer[2700] __attribute__ ((aligned(32)));
int eNB_sync_buffer0[640*6] __attribute__ ((aligned(32)));
int eNB_sync_buffer1[640*6] __attribute__ ((aligned(32)));
int *eNB_sync_buffer[2] = {eNB_sync_buffer0, eNB_sync_buffer1};
extern uint16_t hundred_times_log10_NPRB[100];
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment