Commit b1e28edd authored by Raymond Knopp's avatar Raymond Knopp

Merge branch 'enhancement-43-AVX2' into bugfix-48-L1L2signaling

Corrected some additional FFT-related issues from the AVX2 merge when running with the real-time MODEM.

Conflicts:
	openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
parents 801e343c c0f6881c
...@@ -174,7 +174,7 @@ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath -Wl,${OPENAIR_ ...@@ -174,7 +174,7 @@ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath -Wl,${OPENAIR_
# these changes are related to hardcoded path to include .h files # these changes are related to hardcoded path to include .h files
add_definitions(-DCMAKER) add_definitions(-DCMAKER)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O2") set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS} -g -DMALLOC_CHECK_=3 -O3")
set(GIT_BRANCH "UNKNOWN") set(GIT_BRANCH "UNKNOWN")
...@@ -949,6 +949,7 @@ set(PHY_SRC ...@@ -949,6 +949,7 @@ set(PHY_SRC
${OPENAIR1_DIR}/PHY/CODING/crc_byte.c ${OPENAIR1_DIR}/PHY/CODING/crc_byte.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c ${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_8bit.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c ${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
${OPENAIR1_DIR}/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
${OPENAIR1_DIR}/PHY/CODING/lte_rate_matching.c ${OPENAIR1_DIR}/PHY/CODING/lte_rate_matching.c
${OPENAIR1_DIR}/PHY/CODING/rate_matching.c ${OPENAIR1_DIR}/PHY/CODING/rate_matching.c
${OPENAIR1_DIR}/PHY/CODING/viterbi.c ${OPENAIR1_DIR}/PHY/CODING/viterbi.c
......
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
#include "extern_3GPPinterleaver.h" #include "extern_3GPPinterleaver.h"
#else #else
#include "vars.h" #include "vars.h"
#include <stdint.h>
#endif #endif
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
...@@ -48,6 +49,7 @@ ...@@ -48,6 +49,7 @@
#define print_shorts(s,x) printf("%s %x,%x,%x,%x,%x,%x,%x,%x\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7]) #define print_shorts(s,x) printf("%s %x,%x,%x,%x,%x,%x,%x,%x\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7])
#define print_ints(s,x) printf("%s %x %x %x %x\n",s,(x)[0],(x)[1],(x)[2],(x)[3]) #define print_ints(s,x) printf("%s %x %x %x %x\n",s,(x)[0],(x)[1],(x)[2],(x)[3])
#define print_bytes2(s,x) printf("%s %x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7],(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15],(x)[16],(x)[17],(x)[18],(x)[19],(x)[20],(x)[21],(x)[22],(x)[23],(x)[24],(x)[25],(x)[26],(x)[27],(x)[28],(x)[29],(x)[30],(x)[31])
//#define DEBUG_TURBO_ENCODER 1 //#define DEBUG_TURBO_ENCODER 1
#define CALLGRIND 1 #define CALLGRIND 1
...@@ -57,16 +59,12 @@ unsigned long long threegpplte_interleaver_tmp; ...@@ -57,16 +59,12 @@ unsigned long long threegpplte_interleaver_tmp;
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
struct treillis { struct treillis {
union { union {
__m64 systematic_64[3]; __m64 systematic_andp1_64[3];
char systematic_8[24]; uint8_t systematic_andp1_8[24];
};
union {
__m64 parity1_64[3];
char parity1_8[24];
}; };
union { union {
__m64 parity2_64[3]; __m64 parity2_64[3];
char parity2_8[24]; uint8_t parity2_8[24];
}; };
int exit_state; int exit_state;
} __attribute__ ((aligned(64))); } __attribute__ ((aligned(64)));
...@@ -75,12 +73,8 @@ struct treillis { ...@@ -75,12 +73,8 @@ struct treillis {
struct treillis { struct treillis {
union { union {
uint8x8_t systematic_64[3]; uint8x8_t systematic_andp1_64[3];
char systematic_8[24]; char systematic_andp1_8[24];
}__attribute__((aligned(64)));
union {
uint8x8_t parity1_64[3];
char parity1_8[24];
}__attribute__((aligned(64))); }__attribute__((aligned(64)));
union { union {
uint8x8_t parity2_64[3]; uint8x8_t parity2_64[3];
...@@ -91,6 +85,7 @@ struct treillis { ...@@ -91,6 +85,7 @@ struct treillis {
#endif #endif
struct treillis all_treillis[8][256]; struct treillis all_treillis[8][256];
int all_treillis_initialized=0; int all_treillis_initialized=0;
static inline unsigned char threegpplte_rsc(unsigned char input,unsigned char *state) static inline unsigned char threegpplte_rsc(unsigned char input,unsigned char *state)
...@@ -116,18 +111,20 @@ void treillis_table_init(void) ...@@ -116,18 +111,20 @@ void treillis_table_init(void)
unsigned char v, current_state; unsigned char v, current_state;
// clear all_treillis // clear all_treillis
for (i=0; i<8; i++) for (i=0; i<8; i++) {
bzero( all_treillis[i], sizeof(all_treillis[0]) ); bzero( all_treillis[i], sizeof(all_treillis[0]) );
}
for (i=0; i<8; i++) { //all possible initial states for (i=0; i<8; i++) { //all possible initial states
for (j=0; j<=255; j++) { // all possible values of a byte for (j=0; j<=255; j++) { // all possible values of a byte
current_state=i; current_state=i;
for (b=0; b<8 ; b++ ) { // pre-compute the image of the byte j in _m128i vector right place for (b=0; b<8 ; b++ ) { // pre-compute the image of the byte j in _m128i vector right place
all_treillis[i][j].systematic_8[b*3]= (j&(1<<(7-b)))>>(7-b); all_treillis[i][j].systematic_andp1_8[b*3]= (j&(1<<(7-b)))>>(7-b);
v=threegpplte_rsc( all_treillis[i][j].systematic_8[b*3] , v=threegpplte_rsc( all_treillis[i][j].systematic_andp1_8[b*3] ,
&current_state); &current_state);
all_treillis[i][j].parity1_8[b*3+1]=v; // for the yparity1 all_treillis[i][j].systematic_andp1_8[b*3+1]=v; // for the yparity1
// all_treillis[i][j].parity1_8[b*3+1]=v; // for the yparity1
all_treillis[i][j].parity2_8[b*3+2]=v; // for the yparity2 all_treillis[i][j].parity2_8[b*3+2]=v; // for the yparity2
} }
...@@ -143,9 +140,10 @@ void treillis_table_init(void) ...@@ -143,9 +140,10 @@ void treillis_table_init(void)
char interleave_compact_byte(short * base_interleaver,unsigned char * input, unsigned char * output, int n) char interleave_compact_byte(short * base_interleaver,unsigned char * input, unsigned char * output, int n)
{ {
char expandInput[768*8] __attribute__((aligned(16))); char expandInput[768*8] __attribute__((aligned(32)));
int i,loop=n>>4; int i,loop=n>>4;
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
#ifndef __AVX2__
__m128i *i_128=(__m128i *)input, *o_128=(__m128i*)expandInput; __m128i *i_128=(__m128i *)input, *o_128=(__m128i*)expandInput;
__m128i tmp1, tmp2, tmp3, tmp4; __m128i tmp1, tmp2, tmp3, tmp4;
__m128i BIT_MASK = _mm_set_epi8( 0b00000001, __m128i BIT_MASK = _mm_set_epi8( 0b00000001,
...@@ -164,6 +162,43 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns ...@@ -164,6 +162,43 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
0b00100000, 0b00100000,
0b01000000, 0b01000000,
0b10000000); 0b10000000);
#else
__m256i *i_256=(__m256i *)input, *o_256=(__m256i*)expandInput;
__m256i tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
__m256i BIT_MASK = _mm256_set_epi8( 0b00000001,
0b00000010,
0b00000100,
0b00001000,
0b00010000,
0b00100000,
0b01000000,
0b10000000,
0b00000001,
0b00000010,
0b00000100,
0b00001000,
0b00010000,
0b00100000,
0b01000000,
0b10000000,
0b00000001,
0b00000010,
0b00000100,
0b00001000,
0b00010000,
0b00100000,
0b01000000,
0b10000000,
0b00000001,
0b00000010,
0b00000100,
0b00001000,
0b00010000,
0b00100000,
0b01000000,
0b10000000);
#endif
#elif defined(__arm__) #elif defined(__arm__)
uint8x16_t *i_128=(uint8x16_t *)input, *o_128=(uint8x16_t *)expandInput; uint8x16_t *i_128=(uint8x16_t *)input, *o_128=(uint8x16_t *)expandInput;
uint8x16_t tmp1,tmp2; uint8x16_t tmp1,tmp2;
...@@ -187,46 +222,126 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns ...@@ -187,46 +222,126 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
0b00000010, 0b00000010,
0b00000001}; 0b00000001};
#endif #endif
#ifndef __AVX2__
if ((n&15) > 0) if ((n&15) > 0)
loop++; loop++;
#else
loop=n>>5;
if ((n&31) > 0)
loop++;
#endif
for (i=0; i<loop ; i++ ) { for (i=0; i<loop ; i++ ) {
/* int cur_byte=i<<3; */ // int cur_byte=i<<3;
/* for (b=0;b<8;b++) */ // for (b=0;b<8;b++)
/* expandInput[cur_byte+b] = (input[i]&(1<<(7-b)))>>(7-b); */ // expandInput[cur_byte+b] = (input[i]&(1<<(7-b)))>>(7-b);
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
tmp1=_mm_load_si128(i_128++); #ifndef __AVX2__
tmp2=_mm_unpacklo_epi8(tmp1,tmp1); tmp1=_mm_load_si128(i_128++); // tmp1 = B0,B1,...,B15
tmp3=_mm_unpacklo_epi16(tmp2,tmp2); tmp2=_mm_unpacklo_epi8(tmp1,tmp1); // tmp2 = B0,B0,B1,B1,...,B7,B7
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); tmp3=_mm_unpacklo_epi16(tmp2,tmp2); // tmp3 = B0,B0,B0,B0,B1,B1,B1,B1,B2,B2,B2,B2,B3,B3,B3,B3
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 - B0,B0,B0,B0,B0,B0,B0,B0,B1,B1,B1,B1,B1,B1,B1,B1
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK); *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 - B2,B2,B2,B2,B2,B2,B2,B2,B3,B3,B3,B3,B3,B3,B3,B3
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp3=_mm_unpackhi_epi16(tmp2,tmp2); tmp3=_mm_unpackhi_epi16(tmp2,tmp2); // tmp3 = B4,B4,B4,B4,B5,B5,B5,B5,B6,B6,B6,B6,B7,B7,B7,B7
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 - B4,B4,B4,B4,B4,B4,B4,B4,B5,B5,B5,B5,B5,B5,B5,B5
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 - B6,B6,B6,B6,B6,B6,B6,B6,B7,B7,B7,B7,B7,B7,B7,B7
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp2=_mm_unpackhi_epi8(tmp1,tmp1); tmp2=_mm_unpackhi_epi8(tmp1,tmp1); // tmp2 = B8,B8,B9,B9,...,B15,B15
tmp3=_mm_unpacklo_epi16(tmp2,tmp2); tmp3=_mm_unpacklo_epi16(tmp2,tmp2); // tmp3 = B8,B8,B8,B8,B9,B9,B9,B9,B10,B10,B10,B10,B11,B11,B11,B11
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 = B8,B8,B8,B8,B8,B8,B8,B8,B9,B9,B9,B9,B9,B9,B9,B9
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 = B10,B10,B10,B10,B10,B10,B10,B10,B11,B11,B11,B11,B11,B11,B11,B11
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp3=_mm_unpackhi_epi16(tmp2,tmp2); tmp3=_mm_unpackhi_epi16(tmp2,tmp2); // tmp3 = B12,B12,B12,B12,B13,B13,B13,B13,B14,B14,B14,B14,B15,B15,B15,B15
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 = B12,B12,B12,B12,B12,B12,B12,B12,B13,B13,B13,B13,B13,B13,B13,B13
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 = B14,B14,B14,B14,B14,B14,B14,B14,B15,B15,B15,B15,B15,B15,B15,B15
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);; *o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
#else
tmp1=_mm256_load_si256(i_256++); // tmp1 = B0,B1,...,B15,...,B31
//print_bytes2("in",(uint8_t*)&tmp1);
tmp2=_mm256_unpacklo_epi8(tmp1,tmp1); // tmp2 = B0,B0,B1,B1,...,B7,B7,B16,B16,B17,B17,...,B23,B23
tmp3=_mm256_unpacklo_epi16(tmp2,tmp2); // tmp3 = B0,B0,B0,B0,B1,B1,B1,B1,B2,B2,B2,B2,B3,B3,B3,B3,B16,B16,B16,B16,...,B19,B19,B19,B19
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 - B0,B0,B0,B0,B0,B0,B0,B0,B1,B1,B1,B1,B1,B1,B1,B1,B16,B16...,B17..,B17
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 - B2,B2,B2,B2,B2,B2,B2,B2,B3,B3,B3,B3,B3,B3,B3,B3,B18...,B18,B19,...,B19
tmp6=_mm256_insertf128_si256(tmp4,_mm256_extracti128_si256(tmp5,0),1); // tmp6 = B0 B1 B2 B3
tmp7=_mm256_insertf128_si256(tmp5,_mm256_extracti128_si256(tmp4,1),0); // tmp7 = B16 B17 B18 B19
//print_bytes2("tmp2",(uint8_t*)&tmp2);
//print_bytes2("tmp3",(uint8_t*)&tmp3);
//print_bytes2("tmp4",(uint8_t*)&tmp4);
//print_bytes2("tmp5",(uint8_t*)&tmp4);
//print_bytes2("tmp6",(uint8_t*)&tmp6);
//print_bytes2("tmp7",(uint8_t*)&tmp7);
o_256[0]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp6,BIT_MASK),BIT_MASK);
//print_bytes2("out",(uint8_t*)o_256);
o_256[4]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+4));
tmp3=_mm256_unpackhi_epi16(tmp2,tmp2); // tmp3 = B4,B4,B4,B4,B5,B5,B5,B5,B6,B6,B6,B6,B7,B7,B7,B7,B20,B20,B20,B20,...,B23,B23,B23,B23
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 - B4,B4,B4,B4,B4,B4,B4,B4,B5,B5,B5,B5,B5,B5,B5,B5,B20,B20...,B21..,B21
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 - B6,B6,B6,B6,B6,B6,B6,B6,B7,B7,B7,B7,B7,B7,B7,B7,B22...,B22,B23,...,B23
tmp6=_mm256_insertf128_si256(tmp4,_mm256_extracti128_si256(tmp5,0),1); // tmp6 = B4 B5 B6 B7
tmp7=_mm256_insertf128_si256(tmp5,_mm256_extracti128_si256(tmp4,1),0); // tmp7 = B20 B21 B22 B23
//print_bytes2("tmp2",(uint8_t*)&tmp2);
//print_bytes2("tmp3",(uint8_t*)&tmp3);
//print_bytes2("tmp4",(uint8_t*)&tmp4);
//print_bytes2("tmp5",(uint8_t*)&tmp4);
//print_bytes2("tmp6",(uint8_t*)&tmp6);
//print_bytes2("tmp7",(uint8_t*)&tmp7);
o_256[1]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp6,BIT_MASK),BIT_MASK);
//print_bytes2("out",(uint8_t*)(o_256+1));
o_256[5]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+4));
tmp2=_mm256_unpackhi_epi8(tmp1,tmp1); // tmp2 = B8 B9 B10 B11 B12 B13 B14 B15 B25 B26 B27 B28 B29 B30 B31
tmp3=_mm256_unpacklo_epi16(tmp2,tmp2); // tmp3 = B8,B9,B10,B11,B26,B27,B28,B29
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 - B8,B9,B26,B27
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 - B10,B11,B28,B29
tmp6=_mm256_insertf128_si256(tmp4,_mm256_extracti128_si256(tmp5,0),1); // tmp6 = B8 B9 B10 B11
tmp7=_mm256_insertf128_si256(tmp5,_mm256_extracti128_si256(tmp4,1),0); // tmp7 = B26 B27 B28 B29
//print_bytes2("tmp2",(uint8_t*)&tmp2);
//print_bytes2("tmp3",(uint8_t*)&tmp3);
//print_bytes2("tmp4",(uint8_t*)&tmp4);
//print_bytes2("tmp5",(uint8_t*)&tmp4);
//print_bytes2("tmp6",(uint8_t*)&tmp6);
//print_bytes2("tmp7",(uint8_t*)&tmp7);
o_256[2]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp6,BIT_MASK),BIT_MASK);
//print_bytes2("out",(uint8_t*)(o_256+2));
o_256[6]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+4));
tmp3=_mm256_unpackhi_epi16(tmp2,tmp2); // tmp3 = B12 B13 B14 B15 B28 B29 B30 B31
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 = B12 B13 B28 B29
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 = B14 B15 B30 B31
tmp6=_mm256_insertf128_si256(tmp4,_mm256_extracti128_si256(tmp5,0),1); // tmp6 = B12 B13 B14 B15
tmp7=_mm256_insertf128_si256(tmp5,_mm256_extracti128_si256(tmp4,1),0); // tmp7 = B28 B29 B30 B31
//print_bytes2("tmp2",(uint8_t*)&tmp2);
//print_bytes2("tmp3",(uint8_t*)&tmp3);
//print_bytes2("tmp4",(uint8_t*)&tmp4);
//print_bytes2("tmp5",(uint8_t*)&tmp4);
//print_bytes2("tmp6",(uint8_t*)&tmp6);
//print_bytes2("tmp7",(uint8_t*)&tmp7);
o_256[3]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp6,BIT_MASK),BIT_MASK);
//print_bytes2("out",(uint8_t*)(o_256+3));
o_256[7]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+7));
o_256+=8;
#endif
#elif defined(__arm__) #elif defined(__arm__)
tmp1=vld1q_u8((uint8_t*)i_128); tmp1=vld1q_u8((uint8_t*)i_128);
//print_bytes("tmp1:",(uint8_t*)&tmp1); //print_bytes("tmp1:",(uint8_t*)&tmp1);
...@@ -302,11 +417,17 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns ...@@ -302,11 +417,17 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
i_128++; i_128++;
#endif #endif
} }
short * ptr_intl=base_interleaver; short * ptr_intl=base_interleaver;
#if defined(__x86_64) || defined(__i386__) #if defined(__x86_64) || defined(__i386__)
#ifndef __AVX2__
__m128i tmp; __m128i tmp;
uint16_t *systematic2_ptr=(unsigned short *) output; uint16_t *systematic2_ptr=(uint16_t *) output;
#else
__m256i tmp;
uint32_t *systematic2_ptr=(uint32_t *) output;
#endif
#elif defined(__arm__) #elif defined(__arm__)
uint8x16_t tmp; uint8x16_t tmp;
const uint8_t __attribute__ ((aligned (16))) _Powers[16]= const uint8_t __attribute__ ((aligned (16))) _Powers[16]=
...@@ -316,11 +437,15 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns ...@@ -316,11 +437,15 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
uint8x16_t Powers= vld1q_u8(_Powers); uint8x16_t Powers= vld1q_u8(_Powers);
uint8_t *systematic2_ptr=(uint8_t *) output; uint8_t *systematic2_ptr=(uint8_t *) output;
#endif #endif
#ifndef __AVX2__
int input_length_words=n>>1; int input_length_words=n>>1;
#else
int input_length_words=n>>2;
#endif
for ( i=0; i< input_length_words ; i ++ ) { for ( i=0; i< input_length_words ; i ++ ) {
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
#ifndef __AVX2__
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],7); tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],7);
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],6); tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],6);
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],5); tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],5);
...@@ -338,6 +463,45 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns ...@@ -338,6 +463,45 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+1); tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+1);
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+0); tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+0);
*systematic2_ptr++=(unsigned short)_mm_movemask_epi8(tmp); *systematic2_ptr++=(unsigned short)_mm_movemask_epi8(tmp);
#else
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],5);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],4);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],3);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],0);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+5);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+4);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+3);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+0);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+5);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+4);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+3);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+0);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+5);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+4);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+3);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+0);
*systematic2_ptr++=(unsigned int)_mm256_movemask_epi8(tmp);
#endif
#elif defined(__arm__) #elif defined(__arm__)
tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,7); tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,7);
tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,6); tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,6);
...@@ -391,8 +555,9 @@ void threegpplte_turbo_encoder(unsigned char *input, ...@@ -391,8 +555,9 @@ void threegpplte_turbo_encoder(unsigned char *input,
unsigned short input_length_bits = input_length_bytes<<3; unsigned short input_length_bits = input_length_bytes<<3;
short * base_interleaver; short * base_interleaver;
if ( all_treillis_initialized == 0 ) if ( all_treillis_initialized == 0 ) {
treillis_table_init(); treillis_table_init();
}
// look for f1 and f2 precomputed interleaver values // look for f1 and f2 precomputed interleaver values
for (i=0; i < 188 && f1f2mat[i].nb_bits != input_length_bits; i++); for (i=0; i < 188 && f1f2mat[i].nb_bits != input_length_bits; i++);
...@@ -405,7 +570,7 @@ void threegpplte_turbo_encoder(unsigned char *input, ...@@ -405,7 +570,7 @@ void threegpplte_turbo_encoder(unsigned char *input,
} }
unsigned char systematic2[768]; unsigned char systematic2[768] __attribute__((aligned(32)));
interleave_compact_byte(base_interleaver,input,systematic2,input_length_bytes); interleave_compact_byte(base_interleaver,input,systematic2,input_length_bytes);
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
...@@ -419,22 +584,26 @@ void threegpplte_turbo_encoder(unsigned char *input, ...@@ -419,22 +584,26 @@ void threegpplte_turbo_encoder(unsigned char *input,
for ( state0=state1=i=0 ; i<input_length_bytes; i++ ) { for ( state0=state1=i=0 ; i<input_length_bytes; i++ ) {
cur_s1=input[i]; cur_s1=input[i];
cur_s2=systematic2[i]; cur_s2=systematic2[i];
for ( code_rate=0; code_rate<3; code_rate++) { for ( code_rate=0; code_rate<3; code_rate++) {
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
*ptr_output++ = _mm_add_pi8(all_treillis[state0][cur_s1].systematic_64[code_rate], /*
_mm_add_pi8(all_treillis[state0][cur_s1].parity1_64[code_rate], *ptr_output++ = _mm_add_pi8(all_treillis[state0][cur_s1].systematic_64[code_rate],
all_treillis[state1][cur_s2].parity2_64[code_rate])); _mm_add_pi8(all_treillis[state0][cur_s1].parity1_64[code_rate],
all_treillis[state1][cur_s2].parity2_64[code_rate]));
*/
*ptr_output++ = _mm_add_pi8(all_treillis[state0][cur_s1].systematic_andp1_64[code_rate],
all_treillis[state1][cur_s2].parity2_64[code_rate]);
#elif defined(__arm__) #elif defined(__arm__)
uint8x8_t ptmp = vadd_u8(all_treillis[state0][cur_s1].parity1_64[code_rate], *ptr_output++ = vadd_u8(all_treillis[state0][cur_s1].systematic_andp1_64[code_rate],
all_treillis[state1][cur_s2].parity2_64[code_rate]); all_treillis[state0][cur_s1].parity1_64[code_rate]);
*ptr_output++ = vadd_u8(all_treillis[state0][cur_s1].systematic_64[code_rate],
ptmp);
#endif #endif
} }
state0=all_treillis[state0][cur_s1].exit_state; state0=all_treillis[state0][cur_s1].exit_state;
state1=all_treillis[state1][cur_s2].exit_state; state1=all_treillis[state1][cur_s2].exit_state;
} }
x=output+(input_length_bits*3); x=output+(input_length_bits*3);
...@@ -485,7 +654,7 @@ void threegpplte_turbo_encoder(unsigned char *input, ...@@ -485,7 +654,7 @@ void threegpplte_turbo_encoder(unsigned char *input,
int main(int argc,char **argv) int main(int argc,char **argv)
{ {
unsigned char input[INPUT_LENGTH+16],state,state2; unsigned char input[INPUT_LENGTH+32],state,state2;
unsigned char output[12+(3*(INPUT_LENGTH<<3))],x,z; unsigned char output[12+(3*(INPUT_LENGTH<<3))],x,z;
int i; int i;
unsigned char out; unsigned char out;
...@@ -510,7 +679,7 @@ int main(int argc,char **argv) ...@@ -510,7 +679,7 @@ int main(int argc,char **argv)
memset((void*)input,0,INPUT_LENGTH+16); memset((void*)input,0,INPUT_LENGTH+16);
for (i=0; i<INPUT_LENGTH; i++) { for (i=0; i<INPUT_LENGTH; i++) {
input[i] = i*219; input[i] = i*219;
printf("Input %d : %x\n",i,input[i]); printf("Input %d : %d\n",i,input[i]);
} }
threegpplte_turbo_encoder(&input[0], threegpplte_turbo_encoder(&input[0],
......
/*******************************************************************************
OpenAirInterface
Copyright(c) 1999 - 2014 Eurecom
OpenAirInterface is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OpenAirInterface is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OpenAirInterface.The full GNU General Public License is
included in this distribution in the file called "COPYING". If not,
see <http://www.gnu.org/licenses/>.
Contact Information
OpenAirInterface Admin: openair_admin@eurecom.fr
OpenAirInterface Tech : openair_tech@eurecom.fr
OpenAirInterface Dev : openair4g-devel@lists.eurecom.fr
Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE
*******************************************************************************/
/* file: 3gpplte_turbo_decoder_avx2_16bit.c
purpose: Routines for implementing max-logmap decoding of Turbo-coded (DLSCH) transport channels from 36-212, V8.6 2009-03
authors: raymond.knopp@eurecom.fr, Laurent Thomas (Alcatel-Lucent)
date: 21.10.2009
Note: This version of the routine is only compiled when __AVX2__ is defined and uses 256-bit AVX2 intrinsics. It uses 16-bit inputs for
LLRs and uses 16-bit arithmetic for the internal computations!
Changelog: 17.11.2009 FK SSE4.1 not required anymore
Aug. 2012 new parallelization options for higher speed (8-way parallelization)
Jan. 2013 8-bit LLR support with 16-way parallelization
Feb. 2013 New interleaving and hard-decision optimizations (L. Thomas)
May 2013 Extracted 16bit code
*/
///
///
#ifdef __AVX2__
#include "PHY/sse_intrin.h"
#ifndef TEST_DEBUG
#include "PHY/defs.h"
#include "PHY/CODING/defs.h"
#include "PHY/CODING/lte_interleaver_inline.h"
#include "extern_3GPPinterleaver.h"
#else
#include "defs.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#endif
#ifdef MEX
#include "mex.h"
#endif
//#define DEBUG_LOGMAP
#ifdef DEBUG_LOGMAP
#define print_shorts(s,x) fprintf(fdavx2,"%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7]);fprintf(fdavx2b,"%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15])
FILE *fdavx2,*fdavx2b;
#else
#endif
#define print_bytes(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7],(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15],(x)[16],(x)[17],(x)[18],(x)[19],(x)[20],(x)[21],(x)[22],(x)[23],(x)[24],(x)[25],(x)[26],(x)[27],(x)[28],(x)[29],(x)[30],(x)[31])
typedef int16_t llr_t; // internal decoder LLR data is 16-bit fixed
typedef int16_t channel_t;
#define MAX 256
/* Forward declarations for the AVX2 16-bit max-logmap stages defined in this file. */

/* Top-level driver: runs the gamma, alpha, beta and extrinsic stages in that order,
   bracketing each stage with start_meas()/stop_meas() on the matching time_stats_t. */
void log_map16avx2(llr_t* systematic,channel_t* y_parity, llr_t* m11, llr_t* m10, llr_t *alpha, llr_t *beta, llr_t* ext,uint16_t frame_length,unsigned char term_flag,unsigned char F,int offset8_flag,time_stats_t *alpha_stats,time_stats_t *beta_stats,time_stats_t *gamma_stats,time_stats_t *ext_stats);
/* Fills m11/m10 with the halved saturating sum/difference of the systematic and parity inputs. */
void compute_gamma16avx2(llr_t* m11,llr_t* m10,llr_t* systematic, channel_t* y_parity, uint16_t frame_length,unsigned char term_flag);
/* Computes the alpha metrics into the alpha buffer from m11/m10. */
void compute_alpha16avx2(llr_t*alpha,llr_t *beta, llr_t* m11,llr_t* m10, uint16_t frame_length,unsigned char F);
/* Beta stage (definition not visible in this part of the file). */
void compute_beta16avx2(llr_t*alpha, llr_t* beta,llr_t* m11,llr_t* m10, uint16_t frame_length,unsigned char F,int offset8_flag);
/* Extrinsic stage (definition not visible in this part of the file); log_map16avx2 passes
   its systematic buffer as the ap argument. */
void compute_ext16avx2(llr_t* alpha,llr_t* beta,llr_t* m11,llr_t* m10,llr_t* extrinsic, llr_t* ap, uint16_t frame_length);
/*
 * Run one pass of the AVX2 16-bit log-map computation.
 *
 * The four stages are executed strictly in the order gamma -> alpha -> beta ->
 * extrinsic, and each one is wrapped in start_meas()/stop_meas() on its own
 * time_stats_t so that the stages can be profiled separately.
 *
 *   systematic, y_parity : input buffers handed to the gamma stage
 *   m11, m10             : work buffers written by gamma, read by later stages
 *   alpha, beta          : work buffers for the alpha and beta stages
 *   ext                  : output buffer handed to the extrinsic stage
 *   frame_length         : forwarded unchanged to every stage
 *   term_flag            : forwarded to the gamma stage only
 *   F                    : forwarded to the alpha and beta stages
 *   offset8_flag         : forwarded to the beta stage only
 *   *_stats              : per-stage timing accumulators
 */
void log_map16avx2(llr_t* systematic,
                   channel_t* y_parity,
                   llr_t* m11,
                   llr_t* m10,
                   llr_t *alpha,
                   llr_t *beta,
                   llr_t* ext,
                   uint16_t frame_length,
                   unsigned char term_flag,
                   unsigned char F,
                   int offset8_flag,
                   time_stats_t *alpha_stats,
                   time_stats_t *beta_stats,
                   time_stats_t *gamma_stats,
                   time_stats_t *ext_stats)
{
#ifdef DEBUG_LOGMAP
  /* Both debug streams receive the same banner. */
  fprintf(fdavx2,"log_map (avx2_16bit), frame_length %d\n",frame_length);
  fprintf(fdavx2b,"log_map (avx2_16bit), frame_length %d\n",frame_length);
#endif

  /* Stage 1: gamma (fills m11 / m10). */
  start_meas(gamma_stats);
  compute_gamma16avx2(m11, m10, systematic, y_parity, frame_length, term_flag);
  stop_meas(gamma_stats);

  /* Stage 2: alpha. */
  start_meas(alpha_stats);
  compute_alpha16avx2(alpha, beta, m11, m10, frame_length, F);
  stop_meas(alpha_stats);

  /* Stage 3: beta. */
  start_meas(beta_stats);
  compute_beta16avx2(alpha, beta, m11, m10, frame_length, F, offset8_flag);
  stop_meas(beta_stats);

  /* Stage 4: extrinsic output; the systematic buffer is passed as the
     stage's "ap" argument. */
  start_meas(ext_stats);
  compute_ext16avx2(alpha, beta, m11, m10, ext, systematic, frame_length);
  stop_meas(ext_stats);
}
/*
 * Gamma stage of the AVX2 16-bit log-map.
 *
 * For every 256-bit vector (16 x int16) of the inputs this writes
 *     m11 = (systematic + y_parity) >> 1
 *     m10 = (systematic - y_parity) >> 1
 * using saturating 16-bit add/subtract followed by an arithmetic shift.
 *
 * frame_length >> 3 vectors are processed in the main loop, then one extra
 * "termination" vector is produced at that same index, for which the
 * systematic input is read term_flag vectors further on.
 *
 * All four buffers are dereferenced directly as __m256i, so they are
 * presumably expected to be 32-byte aligned -- TODO confirm with callers.
 */
void compute_gamma16avx2(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
                         uint16_t frame_length,unsigned char term_flag)
{
  __m256i *sys_vec    = (__m256i *)systematic;
  __m256i *parity_vec = (__m256i *)y_parity;
  __m256i *m10_vec    = (__m256i *)m10;
  __m256i *m11_vec    = (__m256i *)m11;
  const int n_vec     = frame_length>>3;   /* vectors handled by the main loop */
  int idx;
  __m256i sum, diff;

#ifdef DEBUG_LOGMAP
  fprintf(fdavx2,"compute_gamma (avx2_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
  fprintf(fdavx2b,"compute_gamma (avx2_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
#endif

  for (idx=0; idx<n_vec; idx++) {
    /* Inputs are re-read for the difference after m11 has been stored, in
       the same order as the straight-line formulation. */
    sum          = _mm256_adds_epi16(sys_vec[idx],parity_vec[idx]);
    m11_vec[idx] = _mm256_srai_epi16(sum,1);
    diff         = _mm256_subs_epi16(sys_vec[idx],parity_vec[idx]);
    m10_vec[idx] = _mm256_srai_epi16(diff,1);
#ifdef DEBUG_LOGMAP
    fprintf(fdavx2,"Loop index k %d\n",idx);
    fprintf(fdavx2b,"Loop index k %d\n",idx);
    print_shorts("sys",(int16_t*)&sys_vec[idx]);
    print_shorts("yp",(int16_t*)&parity_vec[idx]);
    print_shorts("m11",(int16_t*)&m11_vec[idx]);
    print_shorts("m10",(int16_t*)&m10_vec[idx]);
#endif
  }

  // Termination: idx == n_vec here; the systematic input is offset by term_flag vectors
  sum          = _mm256_adds_epi16(sys_vec[idx+term_flag],parity_vec[idx]);
  m11_vec[idx] = _mm256_srai_epi16(sum,1);
  diff         = _mm256_subs_epi16(sys_vec[idx+term_flag],parity_vec[idx]);
  m10_vec[idx] = _mm256_srai_epi16(diff,1);
#ifdef DEBUG_LOGMAP
  fprintf(fdavx2,"Loop index k %d (term flag %d)\n",idx,term_flag);
  fprintf(fdavx2b,"Loop index k %d (term flag %d)\n",idx,term_flag);
  print_shorts("sys",(int16_t*)&sys_vec[idx+term_flag]);
  print_shorts("yp",(int16_t*)&parity_vec[idx]);
  print_shorts("m11",(int16_t*)&m11_vec[idx]);
  print_shorts("m10",(int16_t*)&m10_vec[idx]);
#endif
}
#define L 40
/*
 * Forward (alpha) state-metric recursion of the 16-bit AVX2 log-MAP decoder.
 *
 * alpha        : in/out metric buffer, accessed as __m256i vectors.  Eight
 *                vectors (one per trellis state) are written per step, the
 *                results of step k landing in vectors 8*(k+1) .. 8*(k+1)+7.
 * beta, F      : not used by this function (kept for interface symmetry).
 * m_11, m_10   : branch metrics, one __m256i of each is read per step.
 * frame_length : block length in bits; frame_length>>3 steps are executed.
 *
 * The recursion is run twice.  The first pass covers the whole block starting
 * from a state where lanes 0 and 8 of state 0 are 0 and every other lane is
 * -MAX/2.  The second pass re-runs only the first L>>3 steps, seeded with the
 * final metrics of the first pass shifted up by one 16-bit lane.
 *
 * The rdtsc/printf profiling trace is compiled in only when
 * LOGMAP_AVX2_PROFILE is defined, so that the decoder hot path does no
 * console I/O in normal (real-time) builds.
 */
void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16_t frame_length,unsigned char F)
{
  int k,l,l2,K1,state,rerun_flag=0;
  __m256i *alpha128=(__m256i *)alpha,*alpha_ptr;
  __m256i a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p;
  __m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
  __m256i new0,new1,new2,new3,new4,new5,new6,new7;
  __m256i alpha_max;
#ifdef LOGMAP_AVX2_PROFILE
  unsigned long long timein,timeout;
#endif

  l2 = L>>3;                 // number of steps in the second (rerun) pass
  K1 = (frame_length>>3);    // number of steps in the first (full) pass

#ifdef DEBUG_LOGMAP
  fprintf(fdavx2,"Compute alpha (avx2_16bit)\n");
  fprintf(fdavx2b,"Compute alpha (avx2_16bit)\n");
#endif

#ifdef LOGMAP_AVX2_PROFILE
  timein = rdtsc_oai();
#endif

  // first iteration: l=K1 (full pass); second iteration: l=l2 (rerun), then break
  for (l=K1;; l=l2,rerun_flag=1) {
    alpha128 = (__m256i *)alpha;

    if (rerun_flag == 0) {
      // state 0: lanes 0 and 8 are 0, all other lanes are -MAX/2
      alpha128[0] = _mm256_set_epi16(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,0,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,0);

      // states 1-7: every lane is -MAX/2
      for (state=1; state<8; state++)
        alpha128[state] = _mm256_set1_epi16(-MAX/2);

#ifdef DEBUG_LOGMAP
      fprintf(fdavx2,"Initial alpha\n");
      fprintf(fdavx2b,"Initial alpha\n");
      print_shorts("a0",(int16_t*)&alpha128[0]);
      print_shorts("a1",(int16_t*)&alpha128[1]);
      print_shorts("a2",(int16_t*)&alpha128[2]);
      print_shorts("a3",(int16_t*)&alpha128[3]);
      print_shorts("a4",(int16_t*)&alpha128[4]);
      print_shorts("a5",(int16_t*)&alpha128[5]);
      print_shorts("a6",(int16_t*)&alpha128[6]);
      print_shorts("a7",(int16_t*)&alpha128[7]);
#endif
    } else {
      //set initial alpha in columns 1-7 from final alpha from last run in columns 0-6
      // (_mm256_slli_si256 shifts bytes within each 128-bit half independently)
      alpha128[0] = _mm256_slli_si256(alpha128[frame_length],2);
      alpha128[1] = _mm256_slli_si256(alpha128[1+frame_length],2);
      alpha128[2] = _mm256_slli_si256(alpha128[2+frame_length],2);
      alpha128[3] = _mm256_slli_si256(alpha128[3+frame_length],2);
      alpha128[4] = _mm256_slli_si256(alpha128[4+frame_length],2);
      alpha128[5] = _mm256_slli_si256(alpha128[5+frame_length],2);
      alpha128[6] = _mm256_slli_si256(alpha128[6+frame_length],2);
      alpha128[7] = _mm256_slli_si256(alpha128[7+frame_length],2);

      // set initial alpha in column 0 to (0,-MAX/2,...,-MAX/2):
      // state 0 already holds 0 there (shifted-in zeros); states 1-7 get
      // -MAX/2 at element 0 and element 8 of their 16-element vector
      for (state=1; state<8; state++) {
        alpha[16*state]   = -MAX/2;
        alpha[16*state+8] = -MAX/2;
      }

#ifdef DEBUG_LOGMAP
      fprintf(fdavx2,"Second run\n");
      fprintf(fdavx2b,"Second run\n");
      print_shorts("a0",(int16_t*)&alpha128[0]);
      print_shorts("a1",(int16_t*)&alpha128[1]);
      print_shorts("a2",(int16_t*)&alpha128[2]);
      print_shorts("a3",(int16_t*)&alpha128[3]);
      print_shorts("a4",(int16_t*)&alpha128[4]);
      print_shorts("a5",(int16_t*)&alpha128[5]);
      print_shorts("a6",(int16_t*)&alpha128[6]);
      print_shorts("a7",(int16_t*)&alpha128[7]);
#endif
    }

    alpha_ptr = &alpha128[0];
    m11p = (__m256i*)m_11;
    m10p = (__m256i*)m_10;

    for (k=0;
         k<l;
         k++) {

      // candidates coming from the odd-numbered states
      a1=_mm256_load_si256(&alpha_ptr[1]);
      a3=_mm256_load_si256(&alpha_ptr[3]);
      a5=_mm256_load_si256(&alpha_ptr[5]);
      a7=_mm256_load_si256(&alpha_ptr[7]);

      m_b0 = _mm256_adds_epi16(a1,*m11p);  // m11
      m_b4 = _mm256_subs_epi16(a1,*m11p);  // m00=-m11
      m_b1 = _mm256_subs_epi16(a3,*m10p);  // m01=-m10
      m_b5 = _mm256_adds_epi16(a3,*m10p);  // m10
      m_b2 = _mm256_adds_epi16(a5,*m10p);  // m10
      m_b6 = _mm256_subs_epi16(a5,*m10p);  // m01=-m10
      m_b3 = _mm256_subs_epi16(a7,*m11p);  // m00=-m11
      m_b7 = _mm256_adds_epi16(a7,*m11p);  // m11

      // candidates coming from the even-numbered states
      a0=_mm256_load_si256(&alpha_ptr[0]);
      a2=_mm256_load_si256(&alpha_ptr[2]);
      a4=_mm256_load_si256(&alpha_ptr[4]);
      a6=_mm256_load_si256(&alpha_ptr[6]);

      new0 = _mm256_subs_epi16(a0,*m11p);  // m00=-m11
      new4 = _mm256_adds_epi16(a0,*m11p);  // m11
      new1 = _mm256_adds_epi16(a2,*m10p);  // m10
      new5 = _mm256_subs_epi16(a2,*m10p);  // m01=-m10
      new2 = _mm256_subs_epi16(a4,*m10p);  // m01=-m10
      new6 = _mm256_adds_epi16(a4,*m10p);  // m10
      new3 = _mm256_adds_epi16(a6,*m11p);  // m11
      new7 = _mm256_subs_epi16(a6,*m11p);  // m00=-m11

      // keep the better of the two candidates for each state
      a0 = _mm256_max_epi16(m_b0,new0);
      a1 = _mm256_max_epi16(m_b1,new1);
      a2 = _mm256_max_epi16(m_b2,new2);
      a3 = _mm256_max_epi16(m_b3,new3);
      a4 = _mm256_max_epi16(m_b4,new4);
      a5 = _mm256_max_epi16(m_b5,new5);
      a6 = _mm256_max_epi16(m_b6,new6);
      a7 = _mm256_max_epi16(m_b7,new7);

      // per-lane maximum over the 8 states, used to normalise the metrics
      alpha_max = _mm256_max_epi16(a0,a1);
      alpha_max = _mm256_max_epi16(alpha_max,a2);
      alpha_max = _mm256_max_epi16(alpha_max,a3);
      alpha_max = _mm256_max_epi16(alpha_max,a4);
      alpha_max = _mm256_max_epi16(alpha_max,a5);
      alpha_max = _mm256_max_epi16(alpha_max,a6);
      alpha_max = _mm256_max_epi16(alpha_max,a7);

      alpha_ptr+=8;
      m11p++;
      m10p++;

      // store the normalised metrics as the input of the next step
      alpha_ptr[0] = _mm256_subs_epi16(a0,alpha_max);
      alpha_ptr[1] = _mm256_subs_epi16(a1,alpha_max);
      alpha_ptr[2] = _mm256_subs_epi16(a2,alpha_max);
      alpha_ptr[3] = _mm256_subs_epi16(a3,alpha_max);
      alpha_ptr[4] = _mm256_subs_epi16(a4,alpha_max);
      alpha_ptr[5] = _mm256_subs_epi16(a5,alpha_max);
      alpha_ptr[6] = _mm256_subs_epi16(a6,alpha_max);
      alpha_ptr[7] = _mm256_subs_epi16(a7,alpha_max);

#ifdef DEBUG_LOGMAP
      fprintf(fdavx2,"Loop index %d\n",k);
      fprintf(fdavx2b,"Loop index %d\n",k);
      print_shorts("mb0",(int16_t*)&m_b0);
      print_shorts("mb1",(int16_t*)&m_b1);
      print_shorts("mb2",(int16_t*)&m_b2);
      print_shorts("mb3",(int16_t*)&m_b3);
      print_shorts("mb4",(int16_t*)&m_b4);
      print_shorts("mb5",(int16_t*)&m_b5);
      print_shorts("mb6",(int16_t*)&m_b6);
      print_shorts("mb7",(int16_t*)&m_b7);
      fprintf(fdavx2,"Loop index %d, new\n",k);
      fprintf(fdavx2b,"Loop index %d, new\n",k);
      print_shorts("new0",(int16_t*)&new0);
      print_shorts("new1",(int16_t*)&new1);
      print_shorts("new2",(int16_t*)&new2);
      print_shorts("new3",(int16_t*)&new3);
      print_shorts("new4",(int16_t*)&new4);
      print_shorts("new5",(int16_t*)&new5);
      print_shorts("new6",(int16_t*)&new6);
      print_shorts("new7",(int16_t*)&new7);
      fprintf(fdavx2,"Loop index %d, after max\n",k);
      fprintf(fdavx2b,"Loop index %d, after max\n",k);
      print_shorts("a0",(int16_t*)&a0);
      print_shorts("a1",(int16_t*)&a1);
      print_shorts("a2",(int16_t*)&a2);
      print_shorts("a3",(int16_t*)&a3);
      print_shorts("a4",(int16_t*)&a4);
      print_shorts("a5",(int16_t*)&a5);
      print_shorts("a6",(int16_t*)&a6);
      print_shorts("a7",(int16_t*)&a7);
      fprintf(fdavx2,"Loop index %d\n",k);
      fprintf(fdavx2b,"Loop index %d\n",k);
      print_shorts("a0",(int16_t*)&alpha_ptr[0]);
      print_shorts("a1",(int16_t*)&alpha_ptr[1]);
      print_shorts("a2",(int16_t*)&alpha_ptr[2]);
      print_shorts("a3",(int16_t*)&alpha_ptr[3]);
      print_shorts("a4",(int16_t*)&alpha_ptr[4]);
      print_shorts("a5",(int16_t*)&alpha_ptr[5]);
      print_shorts("a6",(int16_t*)&alpha_ptr[6]);
      print_shorts("a7",(int16_t*)&alpha_ptr[7]);
#endif
    }

    if (rerun_flag==1)
      break;
  }

#ifdef LOGMAP_AVX2_PROFILE
  timeout = rdtsc_oai();
  printf("alpha: inner loop time %llu\n",timeout-timein);
#endif
}
/*
 * Backward (beta) state-metric recursion of the 16-bit AVX2 log-MAP decoder.
 *
 * alpha        : metrics produced by the forward recursion; the 8 vectors
 *                starting at vector index frame_length seed the first pass.
 * beta         : in/out metric buffer, accessed as __m256i vectors; written
 *                backwards starting from &beta[frame_length<<4].
 * m_11, m_10   : branch metrics.  One __m256i of each is read per step; the
 *                scalars at index (frame_length<<1)+{0,1,2} (first codeword)
 *                and those indices +8 (second codeword) are the termination
 *                metrics.
 * frame_length : block length in bits; frame_length>>3 steps per full pass.
 * F, offset8_flag : not used by this function.
 *
 * The recursion is run twice.  The first pass covers the whole block; the
 * second pass covers only the last L>>3 steps and is seeded with the first
 * pass's result at the start of the block shifted down by one 16-bit lane.
 * In both passes lanes 7 and 15 of the seed are overwritten with the
 * normalised termination values computed from the scalar metrics.
 *
 * The rdtsc/printf profiling trace is compiled in only when
 * LOGMAP_AVX2_PROFILE is defined, so that the decoder hot path does no
 * console I/O in normal (real-time) builds.
 */
void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_t frame_length,unsigned char F,int offset8_flag)
{
  int k,rerun_flag=0;
  __m256i *m11p,*m10p;
  register __m256i b0,b1,b2,b3,b4,b5,b6,b7;
  register __m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
  register __m256i new0,new1,new2,new3,new4,new5,new6,new7;
  __m256i *beta128,*alpha128,*beta_ptr;
  __m256i beta_max;
  llr_t m11,m10,beta0_16,beta1_16,beta2_16,beta3_16,beta4_16,beta5_16,beta6_16,beta7_16,beta0_2,beta1_2,beta2_2,beta3_2,beta_m;
  llr_t m11_cw2,m10_cw2,beta0_cw2_16,beta1_cw2_16,beta2_cw2_16,beta3_cw2_16,beta4_cw2_16,beta5_cw2_16,beta6_cw2_16,beta7_cw2_16,beta0_2_cw2,beta1_2_cw2,beta2_2_cw2,beta3_2_cw2,beta_m_cw2;
  llr_t beta0,beta1;
  llr_t beta0_cw2,beta1_cw2;
#ifdef LOGMAP_AVX2_PROFILE
  unsigned long long timein,timeout;
#endif

#ifdef DEBUG_LOGMAP
  fprintf(fdavx2,"compute_beta (avx2_16bit), %p,%p,%p,%p,framelength %d,F %d\n",
          beta,m_11,m_10,alpha,frame_length,F);
  fprintf(fdavx2b,"compute_beta (avx2_16bit), %p,%p,%p,%p,framelength %d,F %d\n",
          beta,m_11,m_10,alpha,frame_length,F);
#endif

  // termination for beta initialization
  // Three scalar steps, walking back through termination indices +2, +1, +0,
  // done independently for the first codeword and the second one (offset +8).

  // fprintf(fdavx2,"beta init: offset8 %d\n",offset8_flag);
  m11=(int16_t)m_11[(frame_length<<1)+2];
  m10=(int16_t)m_10[(frame_length<<1)+2];
  m11_cw2=(int16_t)m_11[(frame_length<<1)+8+2];
  m10_cw2=(int16_t)m_10[(frame_length<<1)+8+2];

#ifdef DEBUG_LOGMAP
  fprintf(fdavx2,"m11,m10 %d,%d\n",m11,m10);
  fprintf(fdavx2b,"m11,m10 %d,%d\n",m11_cw2,m10_cw2);
#endif

  beta0 = -m11;//M0T_TERM;
  beta1 = m11;//M1T_TERM;
  beta0_cw2 = -m11_cw2;//M0T_TERM;
  beta1_cw2 = m11_cw2;//M1T_TERM;

  m11=(int16_t)m_11[(frame_length<<1)+1];
  m10=(int16_t)m_10[(frame_length<<1)+1];
  m11_cw2=(int16_t)m_11[(frame_length<<1)+1+8];
  m10_cw2=(int16_t)m_10[(frame_length<<1)+1+8];

#ifdef DEBUG_LOGMAP
  fprintf(fdavx2,"m11,m10 %d,%d\n",m11,m10);
  fprintf(fdavx2b,"m11,m10 %d,%d\n",m11_cw2,m10_cw2);
#endif

  beta0_2 = beta0-m11;//+M0T_TERM;
  beta1_2 = beta0+m11;//+M1T_TERM;
  beta2_2 = beta1+m10;//M2T_TERM;
  beta3_2 = beta1-m10;//+M3T_TERM;
  beta0_2_cw2 = beta0_cw2-m11_cw2;//+M0T_TERM;
  beta1_2_cw2 = beta0_cw2+m11_cw2;//+M1T_TERM;
  beta2_2_cw2 = beta1_cw2+m10_cw2;//M2T_TERM;
  beta3_2_cw2 = beta1_cw2-m10_cw2;//+M3T_TERM;

  m11=(int16_t)m_11[frame_length<<1];
  m10=(int16_t)m_10[frame_length<<1];
  m11_cw2=(int16_t)m_11[(frame_length<<1)+8];
  m10_cw2=(int16_t)m_10[(frame_length<<1)+8];

#ifdef DEBUG_LOGMAP
  fprintf(fdavx2,"m11,m10 %d,%d\n",m11,m10);
  fprintf(fdavx2b,"m11,m10 %d,%d\n",m11_cw2,m10_cw2);
#endif

  beta0_16 = beta0_2-m11;//+M0T_TERM;
  beta1_16 = beta0_2+m11;//+M1T_TERM;
  beta2_16 = beta1_2+m10;//+M2T_TERM;
  beta3_16 = beta1_2-m10;//+M3T_TERM;
  beta4_16 = beta2_2-m10;//+M4T_TERM;
  beta5_16 = beta2_2+m10;//+M5T_TERM;
  beta6_16 = beta3_2+m11;//+M6T_TERM;
  beta7_16 = beta3_2-m11;//+M7T_TERM;

  beta0_cw2_16 = beta0_2_cw2-m11_cw2;//+M0T_TERM;
  beta1_cw2_16 = beta0_2_cw2+m11_cw2;//+M1T_TERM;
  beta2_cw2_16 = beta1_2_cw2+m10_cw2;//+M2T_TERM;
  beta3_cw2_16 = beta1_2_cw2-m10_cw2;//+M3T_TERM;
  beta4_cw2_16 = beta2_2_cw2-m10_cw2;//+M4T_TERM;
  beta5_cw2_16 = beta2_2_cw2+m10_cw2;//+M5T_TERM;
  beta6_cw2_16 = beta3_2_cw2+m11_cw2;//+M6T_TERM;
  beta7_cw2_16 = beta3_2_cw2-m11_cw2;//+M7T_TERM;

  // normalise: subtract the largest of the 8 termination values (per codeword)
  beta_m = (beta0_16>beta1_16) ? beta0_16 : beta1_16;
  beta_m = (beta_m>beta2_16) ? beta_m : beta2_16;
  beta_m = (beta_m>beta3_16) ? beta_m : beta3_16;
  beta_m = (beta_m>beta4_16) ? beta_m : beta4_16;
  beta_m = (beta_m>beta5_16) ? beta_m : beta5_16;
  beta_m = (beta_m>beta6_16) ? beta_m : beta6_16;
  beta_m = (beta_m>beta7_16) ? beta_m : beta7_16;

  beta_m_cw2 = (beta0_cw2_16>beta1_cw2_16) ? beta0_cw2_16 : beta1_cw2_16;
  beta_m_cw2 = (beta_m_cw2>beta2_cw2_16) ? beta_m_cw2 : beta2_cw2_16;
  beta_m_cw2 = (beta_m_cw2>beta3_cw2_16) ? beta_m_cw2 : beta3_cw2_16;
  beta_m_cw2 = (beta_m_cw2>beta4_cw2_16) ? beta_m_cw2 : beta4_cw2_16;
  beta_m_cw2 = (beta_m_cw2>beta5_cw2_16) ? beta_m_cw2 : beta5_cw2_16;
  beta_m_cw2 = (beta_m_cw2>beta6_cw2_16) ? beta_m_cw2 : beta6_cw2_16;
  beta_m_cw2 = (beta_m_cw2>beta7_cw2_16) ? beta_m_cw2 : beta7_cw2_16;

  beta0_16=beta0_16-beta_m;
  beta1_16=beta1_16-beta_m;
  beta2_16=beta2_16-beta_m;
  beta3_16=beta3_16-beta_m;
  beta4_16=beta4_16-beta_m;
  beta5_16=beta5_16-beta_m;
  beta6_16=beta6_16-beta_m;
  beta7_16=beta7_16-beta_m;

  beta0_cw2_16=beta0_cw2_16-beta_m_cw2;
  beta1_cw2_16=beta1_cw2_16-beta_m_cw2;
  beta2_cw2_16=beta2_cw2_16-beta_m_cw2;
  beta3_cw2_16=beta3_cw2_16-beta_m_cw2;
  beta4_cw2_16=beta4_cw2_16-beta_m_cw2;
  beta5_cw2_16=beta5_cw2_16-beta_m_cw2;
  beta6_cw2_16=beta6_cw2_16-beta_m_cw2;
  beta7_cw2_16=beta7_cw2_16-beta_m_cw2;

  // rerun_flag==0: full pass; rerun_flag==1: partial second pass, then break
  for (rerun_flag=0;; rerun_flag=1) {
    beta_ptr   = (__m256i*)&beta[frame_length<<4];
    alpha128   = (__m256i*)&alpha[0];

    if (rerun_flag == 0) {
      // seed the last beta column with the final alpha metrics
      beta_ptr[0] = alpha128[(frame_length)];
      beta_ptr[1] = alpha128[1+(frame_length)];
      beta_ptr[2] = alpha128[2+(frame_length)];
      beta_ptr[3] = alpha128[3+(frame_length)];
      beta_ptr[4] = alpha128[4+(frame_length)];
      beta_ptr[5] = alpha128[5+(frame_length)];
      beta_ptr[6] = alpha128[6+(frame_length)];
      beta_ptr[7] = alpha128[7+(frame_length)];

#ifdef DEBUG_LOGMAP
      fprintf(fdavx2,"beta init \n");
      fprintf(fdavx2b,"beta init \n");
      print_shorts("b0",(int16_t*)&beta_ptr[0]);
      print_shorts("b1",(int16_t*)&beta_ptr[1]);
      print_shorts("b2",(int16_t*)&beta_ptr[2]);
      print_shorts("b3",(int16_t*)&beta_ptr[3]);
      print_shorts("b4",(int16_t*)&beta_ptr[4]);
      print_shorts("b5",(int16_t*)&beta_ptr[5]);
      print_shorts("b6",(int16_t*)&beta_ptr[6]);
      print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
    } else {
      // seed from the first pass's betas at the start of the block, moved
      // down one 16-bit lane (the byte shift acts on each 128-bit half)
      beta128 = (__m256i*)&beta[0];
      beta_ptr[0] = _mm256_srli_si256(beta128[0],2);
      beta_ptr[1] = _mm256_srli_si256(beta128[1],2);
      beta_ptr[2] = _mm256_srli_si256(beta128[2],2);
      beta_ptr[3] = _mm256_srli_si256(beta128[3],2);
      beta_ptr[4] = _mm256_srli_si256(beta128[4],2);
      beta_ptr[5] = _mm256_srli_si256(beta128[5],2);
      beta_ptr[6] = _mm256_srli_si256(beta128[6],2);
      beta_ptr[7] = _mm256_srli_si256(beta128[7],2);

#ifdef DEBUG_LOGMAP
      fprintf(fdavx2,"beta init (second run)\n");
      fprintf(fdavx2b,"beta init (second run)\n");
      print_shorts("b0",(int16_t*)&beta_ptr[0]);
      print_shorts("b1",(int16_t*)&beta_ptr[1]);
      print_shorts("b2",(int16_t*)&beta_ptr[2]);
      print_shorts("b3",(int16_t*)&beta_ptr[3]);
      print_shorts("b4",(int16_t*)&beta_ptr[4]);
      print_shorts("b5",(int16_t*)&beta_ptr[5]);
      print_shorts("b6",(int16_t*)&beta_ptr[6]);
      print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
    }

    // lane 7 carries the first codeword's termination values ...
    beta_ptr[0] = _mm256_insert_epi16(beta_ptr[0],beta0_16,7);
    beta_ptr[1] = _mm256_insert_epi16(beta_ptr[1],beta1_16,7);
    beta_ptr[2] = _mm256_insert_epi16(beta_ptr[2],beta2_16,7);
    beta_ptr[3] = _mm256_insert_epi16(beta_ptr[3],beta3_16,7);
    beta_ptr[4] = _mm256_insert_epi16(beta_ptr[4],beta4_16,7);
    beta_ptr[5] = _mm256_insert_epi16(beta_ptr[5],beta5_16,7);
    beta_ptr[6] = _mm256_insert_epi16(beta_ptr[6],beta6_16,7);
    beta_ptr[7] = _mm256_insert_epi16(beta_ptr[7],beta7_16,7);

    // ... and lane 15 the second codeword's
    beta_ptr[0] = _mm256_insert_epi16(beta_ptr[0],beta0_cw2_16,15);
    beta_ptr[1] = _mm256_insert_epi16(beta_ptr[1],beta1_cw2_16,15);
    beta_ptr[2] = _mm256_insert_epi16(beta_ptr[2],beta2_cw2_16,15);
    beta_ptr[3] = _mm256_insert_epi16(beta_ptr[3],beta3_cw2_16,15);
    beta_ptr[4] = _mm256_insert_epi16(beta_ptr[4],beta4_cw2_16,15);
    beta_ptr[5] = _mm256_insert_epi16(beta_ptr[5],beta5_cw2_16,15);
    beta_ptr[6] = _mm256_insert_epi16(beta_ptr[6],beta6_cw2_16,15);
    beta_ptr[7] = _mm256_insert_epi16(beta_ptr[7],beta7_cw2_16,15);

#ifdef DEBUG_LOGMAP
    fprintf(fdavx2,"beta init (after insert) \n");
    fprintf(fdavx2b,"beta init (after insert) \n");
    print_shorts("b0",(int16_t*)&beta_ptr[0]);
    print_shorts("b1",(int16_t*)&beta_ptr[1]);
    print_shorts("b2",(int16_t*)&beta_ptr[2]);
    print_shorts("b3",(int16_t*)&beta_ptr[3]);
    print_shorts("b4",(int16_t*)&beta_ptr[4]);
    print_shorts("b5",(int16_t*)&beta_ptr[5]);
    print_shorts("b6",(int16_t*)&beta_ptr[6]);
    print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif

    // lowest step index processed in this pass: 0 for the full pass,
    // (frame_length-L)>>3 for the rerun
    int loopval=((rerun_flag==0)?0:((frame_length-L)>>3));

#ifdef LOGMAP_AVX2_PROFILE
    printf("beta: rerun %d => loopval %d\n",rerun_flag,loopval);
    timein = rdtsc_oai();
#endif

    m11p = (frame_length>>3)-1+(__m256i*)m_11;
    m10p = (frame_length>>3)-1+(__m256i*)m_10;

    for (k=(frame_length>>3)-1; k>=loopval; k--) {

      // candidates coming from states 4-7
      b4 = _mm256_load_si256(&beta_ptr[4]);
      b5 = _mm256_load_si256(&beta_ptr[5]);
      b6 = _mm256_load_si256(&beta_ptr[6]);
      b7 = _mm256_load_si256(&beta_ptr[7]);

      m_b0 = _mm256_adds_epi16(b4,*m11p);  //m11
      m_b1 = _mm256_subs_epi16(b4,*m11p);  //m00
      m_b2 = _mm256_subs_epi16(b5,*m10p);  //m01
      m_b3 = _mm256_adds_epi16(b5,*m10p);  //m10
      m_b4 = _mm256_adds_epi16(b6,*m10p);  //m10
      m_b5 = _mm256_subs_epi16(b6,*m10p);  //m01
      m_b6 = _mm256_subs_epi16(b7,*m11p);  //m00
      m_b7 = _mm256_adds_epi16(b7,*m11p);  //m11

      // candidates coming from states 0-3
      b0 = _mm256_load_si256(&beta_ptr[0]);
      b1 = _mm256_load_si256(&beta_ptr[1]);
      b2 = _mm256_load_si256(&beta_ptr[2]);
      b3 = _mm256_load_si256(&beta_ptr[3]);

      new0 = _mm256_subs_epi16(b0,*m11p);  //m00
      new1 = _mm256_adds_epi16(b0,*m11p);  //m11
      new2 = _mm256_adds_epi16(b1,*m10p);  //m10
      new3 = _mm256_subs_epi16(b1,*m10p);  //m01
      new4 = _mm256_subs_epi16(b2,*m10p);  //m01
      new5 = _mm256_adds_epi16(b2,*m10p);  //m10
      new6 = _mm256_adds_epi16(b3,*m11p);  //m11
      new7 = _mm256_subs_epi16(b3,*m11p);  //m00

      // keep the better of the two candidates for each state
      b0 = _mm256_max_epi16(m_b0,new0);
      b1 = _mm256_max_epi16(m_b1,new1);
      b2 = _mm256_max_epi16(m_b2,new2);
      b3 = _mm256_max_epi16(m_b3,new3);
      b4 = _mm256_max_epi16(m_b4,new4);
      b5 = _mm256_max_epi16(m_b5,new5);
      b6 = _mm256_max_epi16(m_b6,new6);
      b7 = _mm256_max_epi16(m_b7,new7);

      // per-lane maximum over the 8 states, used to normalise the metrics
      beta_max = _mm256_max_epi16(b0,b1);
      beta_max = _mm256_max_epi16(beta_max   ,b2);
      beta_max = _mm256_max_epi16(beta_max   ,b3);
      beta_max = _mm256_max_epi16(beta_max   ,b4);
      beta_max = _mm256_max_epi16(beta_max   ,b5);
      beta_max = _mm256_max_epi16(beta_max   ,b6);
      beta_max = _mm256_max_epi16(beta_max   ,b7);

      beta_ptr-=8;
      m11p--;
      m10p--;

      // store the normalised metrics as the input of the next (earlier) step
      beta_ptr[0] = _mm256_subs_epi16(b0,beta_max);
      beta_ptr[1] = _mm256_subs_epi16(b1,beta_max);
      beta_ptr[2] = _mm256_subs_epi16(b2,beta_max);
      beta_ptr[3] = _mm256_subs_epi16(b3,beta_max);
      beta_ptr[4] = _mm256_subs_epi16(b4,beta_max);
      beta_ptr[5] = _mm256_subs_epi16(b5,beta_max);
      beta_ptr[6] = _mm256_subs_epi16(b6,beta_max);
      beta_ptr[7] = _mm256_subs_epi16(b7,beta_max);

#ifdef DEBUG_LOGMAP
      fprintf(fdavx2,"Loop index %d, mb\n",k);
      fprintf(fdavx2,"beta init (after max)\n");
      fprintf(fdavx2b,"Loop index %d, mb\n",k);
      fprintf(fdavx2b,"beta init (after max)\n");
      print_shorts("b0",(int16_t*)&beta_ptr[0]);
      print_shorts("b1",(int16_t*)&beta_ptr[1]);
      print_shorts("b2",(int16_t*)&beta_ptr[2]);
      print_shorts("b3",(int16_t*)&beta_ptr[3]);
      print_shorts("b4",(int16_t*)&beta_ptr[4]);
      print_shorts("b5",(int16_t*)&beta_ptr[5]);
      print_shorts("b6",(int16_t*)&beta_ptr[6]);
      print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
    }

#ifdef LOGMAP_AVX2_PROFILE
    timeout = rdtsc_oai();
    printf("beta: inner loop time %llu\n",timeout-timein);
#endif

    if (rerun_flag==1)
      break;
  }
}
/*
 * Extrinsic LLR computation of the 16-bit AVX2 log-MAP decoder.
 *
 * For each of the frame_length>>3 steps, the 8 alpha vectors of that step are
 * combined with the 8 beta vectors of the following step (beta is read
 * starting at vector index 8), the branch metrics are applied, and one
 * __m256i of output is stored to ext.
 *
 * alpha, beta  : state metrics, accessed as __m256i vectors (8 per step).
 * m_11, m_10   : branch metrics, one __m256i of each per step.
 * ext          : output, one __m256i per step.
 * systematic   : not read here (only its address is logged in debug builds).
 * frame_length : block length in bits.
 */
void compute_ext16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, llr_t* systematic,uint16_t frame_length)
{
  __m256i *a = (__m256i *)alpha;          // alpha vectors of the current step
  __m256i *b = ((__m256i *)beta) + 8;     // beta vectors of the next step
  __m256i *gamma11,*gamma10,*out;
  __m256i best00,best01,best10,best11;
  __m256i metric_zero,metric_one;
  int step;

#ifdef DEBUG_LOGMAP
  fprintf(fdavx2,"compute_ext (avx2_16bit), %p, %p, %p, %p, %p, %p ,framelength %d\n",alpha,beta,m_11,m_10,ext,systematic,frame_length);
  fprintf(fdavx2b,"compute_ext (avx2_16bit), %p, %p, %p, %p, %p, %p ,framelength %d\n",alpha,beta,m_11,m_10,ext,systematic,frame_length);
#endif

  for (step=0; step<(frame_length>>3); step++, a+=8, b+=8) {
    gamma11 = (__m256i*)&m_11[step<<4];
    gamma10 = (__m256i*)&m_10[step<<4];
    out     = (__m256i*)&ext[step<<4];

    // best alpha+beta sum over the four transitions of each branch type
    best01 = _mm256_max_epi16(
               _mm256_max_epi16(_mm256_adds_epi16(a[2],b[5]),
                                _mm256_adds_epi16(a[3],b[1])),
               _mm256_max_epi16(_mm256_adds_epi16(a[4],b[2]),
                                _mm256_adds_epi16(a[5],b[6])));

    best00 = _mm256_max_epi16(
               _mm256_max_epi16(_mm256_adds_epi16(a[0],b[0]),
                                _mm256_adds_epi16(a[1],b[4])),
               _mm256_max_epi16(_mm256_adds_epi16(a[6],b[7]),
                                _mm256_adds_epi16(a[7],b[3])));

    best10 = _mm256_max_epi16(
               _mm256_max_epi16(_mm256_adds_epi16(a[2],b[1]),
                                _mm256_adds_epi16(a[3],b[5])),
               _mm256_max_epi16(_mm256_adds_epi16(a[4],b[6]),
                                _mm256_adds_epi16(a[5],b[2])));

    best11 = _mm256_max_epi16(
               _mm256_max_epi16(_mm256_adds_epi16(a[0],b[4]),
                                _mm256_adds_epi16(a[1],b[0])),
               _mm256_max_epi16(_mm256_adds_epi16(a[6],b[3]),
                                _mm256_adds_epi16(a[7],b[7])));

    // apply the branch metrics (m01 = -m10, m00 = -m11)
    best01 = _mm256_subs_epi16(best01,*gamma10);
    best00 = _mm256_subs_epi16(best00,*gamma11);
    best10 = _mm256_adds_epi16(best10,*gamma10);
    best11 = _mm256_adds_epi16(best11,*gamma11);

    // the 01/00 branches form one hypothesis, the 10/11 branches the other
    metric_zero = _mm256_max_epi16(best01,best00);
    metric_one  = _mm256_max_epi16(best10,best11);

    *out = _mm256_subs_epi16(metric_one,metric_zero);

#ifdef DEBUG_LOGMAP
    fprintf(fdavx2,"ext %p\n",out);
    fprintf(fdavx2b,"ext %p\n",out);
    print_shorts("ext:",(int16_t*)out);
    print_shorts("m11:",(int16_t*)gamma11);
    print_shorts("m10:",(int16_t*)gamma10);
    print_shorts("m10_1:",(int16_t*)&metric_one);
    print_shorts("m01_1:",(int16_t*)&metric_zero);
#endif
  }
}
/* Interleaver index tables of the AVX2 decoder: for each of the 188 table
 * entries, one dynamically allocated int array per table.  They are
 * allocated and filled by init_td16avx2() and released by free_td16avx2(). */
int *pi2tab16avx2[188],*pi5tab16avx2[188],*pi4tab16avx2[188],*pi6tab16avx2[188];

/*
 * Release every interleaver table and reset its slot to NULL.
 *
 * Resetting the slots makes the function idempotent: free(NULL) is a no-op,
 * so calling it twice (or before any table was allocated) is harmless, and
 * no dangling pointer is left behind in the global arrays.
 */
void free_td16avx2(void)
{
  int ind;

  for (ind=0; ind<188; ind++) {
    free(pi2tab16avx2[ind]);
    pi2tab16avx2[ind] = NULL;
    free(pi5tab16avx2[ind]);
    pi5tab16avx2[ind] = NULL;
    free(pi4tab16avx2[ind]);
    pi4tab16avx2[ind] = NULL;
    free(pi6tab16avx2[ind]);
    pi6tab16avx2[ind] = NULL;
  }
}
/*
 * Allocate and fill the AVX2 decoder's interleaver index tables for every
 * entry ind of f1f2mat (188 entries), using the block size
 * n = f1f2mat[ind].nb_bits and the interleaver pattern stored at
 * il_tb + f1f2mat[ind].beg_index.
 *
 * pi2tab16avx2[ind][i] maps the i-th position, taken in 8 interleaved
 * sub-sequences (j = i2, i2+8, i2+16, ...), to the storage offset
 * 16*floor(j/8) + (j mod 8); the second codeword lives at that offset + 8.
 * pi4/pi5/pi6 combine that mapping with the interleaver pattern.
 *
 * Each table is allocated with n+8 ints; only the first n are written here.
 * In non-MEX builds an allocation failure is reported on stderr and the
 * process exits instead of dereferencing a NULL table.
 */
void init_td16avx2(void)
{
  int ind,i,i2,i3,j,n,pi,pi2_i,pi2_pi;
  short * base_interleaver;

  for (ind=0; ind<188; ind++) {
    n = f1f2mat[ind].nb_bits;
    base_interleaver=il_tb+f1f2mat[ind].beg_index;

#ifdef MEX
    // MEX build: allocate through the MATLAB allocator (original note: this
    // is needed for the Mex implementation to make the memory persistent).
    // The tables filled below are the *avx2* ones, so those must be the ones
    // allocated here.
    pi2tab16avx2[ind] = mxMalloc((n+8)*sizeof(int));
    pi5tab16avx2[ind] = mxMalloc((n+8)*sizeof(int));
    pi4tab16avx2[ind] = mxMalloc((n+8)*sizeof(int));
    pi6tab16avx2[ind] = mxMalloc((n+8)*sizeof(int));
#else
    pi2tab16avx2[ind] = malloc((n+8)*sizeof(int));
    pi5tab16avx2[ind] = malloc((n+8)*sizeof(int));
    pi4tab16avx2[ind] = malloc((n+8)*sizeof(int));
    pi6tab16avx2[ind] = malloc((n+8)*sizeof(int));

    if (pi2tab16avx2[ind] == NULL || pi5tab16avx2[ind] == NULL ||
        pi4tab16avx2[ind] == NULL || pi6tab16avx2[ind] == NULL) {
      fprintf(stderr,"init_td16avx2: out of memory allocating interleaver tables (index %d, n %d)\n",ind,n);
      exit(-1);
    }
#endif

    //    fprintf(fdavx2,"Interleaver index %d\n",ind);
    for (i=i2=0; i2<8; i2++) {
      j=i2;

      for (i3=0; i3<(n>>3); i3++,i++,j+=8) {
        pi2tab16avx2[ind][i] = ((j>>3)<<4) + (j&7); // 16*floor(j/8) + j mod8, which allows the second codeword to be in pi[i] + 8
      }
    }

    for (i=0; i<n; i++) {
      pi = base_interleaver[i];//(uint32_t)threegpplte_interleaver(f1,f2,n);

      // undo the 16*floor(j/8) + j mod 8 packing to recover the plain index j
      pi2_i  = ((pi2tab16avx2[ind][i]>>4)<<3)+(pi2tab16avx2[ind][i]&7);
      pi2_pi = ((pi2tab16avx2[ind][pi]>>4)<<3)+(pi2tab16avx2[ind][pi]&7);

      pi4tab16avx2[ind][pi2_i]  = pi2tab16avx2[ind][pi];
      pi5tab16avx2[ind][pi2_pi] = pi2tab16avx2[ind][i];
      pi6tab16avx2[ind][pi]     = pi2tab16avx2[ind][i];
    }
  }
}
unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y,
int16_t *y2,
uint8_t *decoded_bytes,
uint8_t *decoded_bytes2,
uint16_t n,
uint16_t f1,
uint16_t f2,
uint8_t max_iterations,
uint8_t crc_type,
uint8_t F,
time_stats_t *init_stats,
time_stats_t *alpha_stats,
time_stats_t *beta_stats,
time_stats_t *gamma_stats,
time_stats_t *ext_stats,
time_stats_t *intl1_stats,
time_stats_t *intl2_stats)
{
/* y is a pointer to the input
decoded_bytes is a pointer to the decoded output
n is the size in bits of the coded block, with the tail */
llr_t systematic0[2*(n+16)] __attribute__ ((aligned(32)));
llr_t systematic1[2*(n+16)] __attribute__ ((aligned(32)));
llr_t systematic2[2*(n+16)] __attribute__ ((aligned(32)));
llr_t yparity1[2*(n+16)] __attribute__ ((aligned(32)));
llr_t yparity2[2*(n+16)] __attribute__ ((aligned(32)));
llr_t ext[2*(n+128)] __attribute__((aligned(32)));
llr_t ext2[2*(n+128)] __attribute__((aligned(32)));
llr_t alpha[(n+16)*16] __attribute__ ((aligned(32)));
llr_t beta[(n+16)*16] __attribute__ ((aligned(32)));
llr_t m11[2*(n+16)] __attribute__ ((aligned(32)));
llr_t m10[2*(n+16)] __attribute__ ((aligned(32)));
int *pi2_p,*pi4_p,*pi5_p,*pi6_p;
llr_t *s,*s1,*s2,*yp1,*yp2,*yp,*yp_cw2;
uint32_t i,j,iind;//,pi;
uint8_t iteration_cnt=0;
uint32_t crc,oldcrc,crc_cw2,oldcrc_cw2,crc_len;
uint8_t temp;
uint32_t db;
__m128i *yp128,*yp128_cw2;
__m256i tmp, zeros=_mm256_setzero_si256();
__m128i tmpe,tmpe_cw2;
int offset8_flag=0;
#ifdef DEBUG_LOGMAP
fdavx2 = fopen("dump_avx2.txt","w");
fdavx2b = fopen("dump_avx2b.txt","w");
printf("tc avx2_16 (y,y2) %p,%p\n",y,y2);
#endif
if (crc_type > 3) {
printf("Illegal crc length!\n");
return 255;
}
start_meas(init_stats);
for (iind=0; iind < 188 && f1f2mat[iind].nb_bits != n; iind++);
if ( iind == 188 ) {
printf("Illegal frame length!\n");
return 255;
}
switch (crc_type) {
case CRC24_A:
case CRC24_B:
crc_len=3;
break;
case CRC16:
crc_len=2;
break;
case CRC8:
crc_len=1;
break;
default:
crc_len=3;
}
yp128 = (__m128i*)y;
yp128_cw2 = (__m128i*)y2;
s = systematic0;
s1 = systematic1;
s2 = systematic2;
yp1 = yparity1;
yp2 = yparity2;
#if 0
for (i=0; i<n; i+=8) {
pi2_p = &pi2tab16avx2[iind][i];
j=pi2_p[0];
tmpe = _mm_load_si128(yp128);
tmpe_cw2 = _mm_load_si128(yp128_cw2);
// fprintf(fdavx2,"yp128 %p\n",yp128);
// print_shorts("tmpe",(int16_t*)&tmpe);
s[j] = _mm_extract_epi16(tmpe,0);
yp1[j] = _mm_extract_epi16(tmpe,1);
yp2[j] = _mm_extract_epi16(tmpe,2);
s[j+8] = _mm_extract_epi16(tmpe_cw2,0);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,1);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,2);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init0: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init0: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[1];
s[j] = _mm_extract_epi16(tmpe,3);
yp1[j] = _mm_extract_epi16(tmpe,4);
yp2[j] = _mm_extract_epi16(tmpe,5);
s[j+8] = _mm_extract_epi16(tmpe_cw2,3);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,4);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,5);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init1: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init1: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[2];
s[j] = _mm_extract_epi16(tmpe,6);
yp1[j] = _mm_extract_epi16(tmpe,7);
tmpe = _mm_load_si128(&yp128[1]);
yp2[j] = _mm_extract_epi16(tmpe,0);
s[j+8] = _mm_extract_epi16(tmpe_cw2,6);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,7);
tmpe_cw2 = _mm_load_si128(&yp128_cw2[1]);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,0);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init2: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init2: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[3];
s[j] = _mm_extract_epi16(tmpe,1);
yp1[j] = _mm_extract_epi16(tmpe,2);
yp2[j] = _mm_extract_epi16(tmpe,3);
s[j+8] = _mm_extract_epi16(tmpe_cw2,1);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,2);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,3);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init3: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init3: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[4];
s[j] = _mm_extract_epi16(tmpe,4);
yp1[j] = _mm_extract_epi16(tmpe,5);
yp2[j] = _mm_extract_epi16(tmpe,6);
s[j+8] = _mm_extract_epi16(tmpe_cw2,4);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,5);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,6);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init4: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init4: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[5];
s[j] = _mm_extract_epi16(tmpe,7);
tmpe = _mm_load_si128(&yp128[2]);
yp1[j] = _mm_extract_epi16(tmpe,0);
yp2[j] = _mm_extract_epi16(tmpe,1);
s[j+8] = _mm_extract_epi16(tmpe_cw2,7);
tmpe_cw2 = _mm_load_si128(&yp128_cw2[2]);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,0);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,1);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init5: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init5: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[6];
s[j] = _mm_extract_epi16(tmpe,2);
yp1[j] = _mm_extract_epi16(tmpe,3);
yp2[j] = _mm_extract_epi16(tmpe,4);
s[j+8] = _mm_extract_epi16(tmpe_cw2,2);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,3);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,4);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init6: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init6: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
j=pi2_p[7];
s[j] = _mm_extract_epi16(tmpe,5);
yp1[j] = _mm_extract_epi16(tmpe,6);
yp2[j] = _mm_extract_epi16(tmpe,7);
s[j+8] = _mm_extract_epi16(tmpe_cw2,5);
yp1[j+8] = _mm_extract_epi16(tmpe_cw2,6);
yp2[j+8] = _mm_extract_epi16(tmpe_cw2,7);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"init7: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j],yp1[j],yp2[j]);
fprintf(fdavx2b,"init7: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",((j>>4)<<3)+(j&7),s[j+8],yp1[j+8],yp2[j+8]);
#endif
yp128+=3;
yp128_cw2+=3;
}
yp=(llr_t*)yp128;
yp_cw2=(llr_t*)yp128_cw2;
#else
pi2_p = &pi2tab16avx2[iind][0];
for (i=0,j=0; i<n; i++) {
s[*pi2_p] = y[j];
s[*pi2_p+8] = y2[j++];
yp1[*pi2_p] = y[j];
yp1[*pi2_p+8] = y2[j++];
yp2[*pi2_p] = y[j];
yp2[(*pi2_p++)+8] = y2[j++];
}
yp=(llr_t*)&y[j];
yp_cw2=(llr_t*)&y2[j];
#endif
// Termination
for (i=0; i<3; i++) {
s[(n<<1)+i] = *yp;
s1[(n<<1)+i] = *yp;
s2[(n<<1)+i] = *yp;
yp++;
yp1[(n<<1)+i] = *yp;
yp++;
s[(n<<1)+i+8] = *yp_cw2;
s1[(n<<1)+i+8] = *yp_cw2;
s2[(n<<1)+i+8] = *yp_cw2;
yp_cw2++;
yp1[(n<<1)+i+8] = *yp_cw2;
yp_cw2++;
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Term 1 (%d): %d %d\n",n+i,s[(n<<1)+i],yp1[(n<<1)+i]);
fprintf(fdavx2b,"Term 1 (%d): %d %d\n",n+i,s[(n<<1)+i+8],yp1[(n<<1)+i+8]);
#endif //DEBUG_LOGMAP
}
for (i=16; i<19; i++) {
s[(n<<1)+i] = *yp;
s1[(n<<1)+i] = *yp;
s2[(n<<1)+i] = *yp;
yp++;
yp2[(n<<1)+(i-16)] = *yp;
yp++;
s[(n<<1)+i+8]= *yp_cw2;
s1[(n<<1)+i+8] = *yp_cw2 ;
s2[(n<<1)+i+8] = *yp_cw2;
yp_cw2++;
yp2[(n<<1)+i-16+8] = *yp_cw2;
yp_cw2++;
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"Term 2 (%d): %d %d\n",n+i-3-8,s[(n<<1)+i],yp2[(n<<1)+i-16]);
fprintf(fdavx2b,"Term 2 (%d): %d %d\n",n+i-3-8,s[(n<<1)+i+8],yp2[(n<<1)+i-16+8]);
#endif //DEBUG_LOGMAP
}
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"\n");
fprintf(fdavx2b,"\n");
#endif //DEBUG_LOGMAP
stop_meas(init_stats);
// do log_map from first parity bit
log_map16avx2(systematic0,yparity1,m11,m10,alpha,beta,ext,n,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
while (iteration_cnt++ < max_iterations) {
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"\n*******************ITERATION %d (n %d), ext %p\n\n",iteration_cnt,n,ext);
fprintf(fdavx2b,"\n*******************ITERATION %d (n %d), ext %p\n\n",iteration_cnt,n,ext);
#endif //DEBUG_LOGMAP
start_meas(intl1_stats);
pi4_p=pi4tab16avx2[iind];
for (i=0; i<(n>>3); i++) { // steady-state portion
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],0);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],8);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],1);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],9);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],2);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],10);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],3);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],11);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],4);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],12);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],5);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],13);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],6);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],14);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[*pi4_p],7);
((__m256i *)systematic2)[i]=_mm256_insert_epi16(((__m256i *)systematic2)[i],ext[8+*pi4_p++],15);
#ifdef DEBUG_LOGMAP
print_shorts("syst2",(int16_t*)&((__m256i *)systematic2)[i]);
#endif
}
stop_meas(intl1_stats);
// do log_map from second parity bit
log_map16avx2(systematic2,yparity2,m11,m10,alpha,beta,ext2,n,1,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
pi5_p=pi5tab16avx2[iind];
for (i=0; i<(n>>3); i++) {
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],0);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],8);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],1);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],9);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],2);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],10);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],3);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],11);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],4);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],12);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],5);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],13);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],6);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],14);
tmp=_mm256_insert_epi16(tmp,ext2[*pi5_p],7);
tmp=_mm256_insert_epi16(tmp,ext2[8+*pi5_p++],15);
((__m256i *)systematic1)[i] = _mm256_adds_epi16(_mm256_subs_epi16(tmp,((__m256i*)ext)[i]),((__m256i *)systematic0)[i]);
#ifdef DEBUG_LOGMAP
print_shorts("syst1",(int16_t*)&((__m256i *)systematic1)[i]);
#endif
}
if (iteration_cnt>1) {
start_meas(intl2_stats);
pi6_p=pi6tab16avx2[iind];
for (i=0; i<(n>>3); i++) {
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],7);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],15);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],6);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],14);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],5);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],13);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],4);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],12);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],3);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],11);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],2);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],10);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],1);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],9);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p],0);
tmp=_mm256_insert_epi16(tmp, ((llr_t*)ext2)[8+*pi6_p++],8);
#ifdef DEBUG_LOGMAP
print_shorts("tmp",(int16_t*)&tmp);
#endif
tmp=_mm256_cmpgt_epi8(_mm256_packs_epi16(tmp,zeros),zeros);
db=(uint32_t)_mm256_movemask_epi8(tmp);
decoded_bytes[i]=db&0xff;
decoded_bytes2[i]=(uint8_t)(db>>16)&0xff;
#ifdef DEBUG_LOGMAP
print_shorts("tmp",(int16_t*)&tmp);
fprintf(fdavx2,"decoded_bytes[%d] %x (%x)\n",i,decoded_bytes[i],db);
fprintf(fdavx2b,"decoded_bytes[%d] %x (%x)\n",i,decoded_bytes2[i],db);
#endif
}
}
// check status on output
if (iteration_cnt>1) {
oldcrc= *((uint32_t *)(&decoded_bytes[(n>>3)-crc_len]));
switch (crc_type) {
case CRC24_A:
oldcrc&=0x00ffffff;
crc = crc24a(&decoded_bytes[F>>3],
n-24-F)>>8;
temp=((uint8_t *)&crc)[2];
((uint8_t *)&crc)[2] = ((uint8_t *)&crc)[0];
((uint8_t *)&crc)[0] = temp;
break;
case CRC24_B:
oldcrc&=0x00ffffff;
crc = crc24b(decoded_bytes,
n-24)>>8;
temp=((uint8_t *)&crc)[2];
((uint8_t *)&crc)[2] = ((uint8_t *)&crc)[0];
((uint8_t *)&crc)[0] = temp;
break;
case CRC16:
oldcrc&=0x0000ffff;
crc = crc16(decoded_bytes,
n-16)>>16;
break;
case CRC8:
oldcrc&=0x000000ff;
crc = crc8(decoded_bytes,
n-8)>>24;
break;
default:
printf("FATAL: 3gpplte_turbo_decoder_sse.c: Unknown CRC\n");
return(255);
break;
}
// second CW
oldcrc_cw2= *((uint32_t *)(&decoded_bytes2[(n>>3)-crc_len]));
switch (crc_type) {
case CRC24_A:
oldcrc_cw2&=0x00ffffff;
crc_cw2 = crc24a(&decoded_bytes2[F>>3],
n-24-F)>>8;
temp=((uint8_t *)&crc_cw2)[2];
((uint8_t *)&crc_cw2)[2] = ((uint8_t *)&crc_cw2)[0];
((uint8_t *)&crc_cw2)[0] = temp;
break;
case CRC24_B:
oldcrc_cw2&=0x00ffffff;
crc_cw2 = crc24b(decoded_bytes2,
n-24)>>8;
temp=((uint8_t *)&crc_cw2)[2];
((uint8_t *)&crc_cw2)[2] = ((uint8_t *)&crc_cw2)[0];
((uint8_t *)&crc_cw2)[0] = temp;
break;
case CRC16:
oldcrc_cw2&=0x0000ffff;
crc_cw2 = crc16(decoded_bytes2,
n-16)>>16;
break;
case CRC8:
oldcrc_cw2&=0x000000ff;
crc_cw2 = crc8(decoded_bytes2,
n-8)>>24;
break;
default:
printf("FATAL: 3gpplte_turbo_decoder_sse.c: Unknown CRC\n");
return(255);
break;
}
stop_meas(intl2_stats);
#ifdef DEBUG_LOGMAP
fprintf(fdavx2,"oldcrc %x, crc %x, oldcrc_cw2 %x, crc_cw2 %x\n",oldcrc,crc,oldcrc_cw2,crc_cw2);
fprintf(fdavx2b,"oldcrc %x, crc %x, oldcrc_cw2 %x, crc_cw2 %x\n",oldcrc,crc,oldcrc_cw2,crc_cw2);
#endif
if ((crc == oldcrc) && (crc!=0) && (crc_cw2 == oldcrc_cw2) && (crc_cw2!=0)) {
return(iteration_cnt);
}
}
// do log_map from first parity bit
if (iteration_cnt < max_iterations) {
log_map16avx2(systematic1,yparity1,m11,m10,alpha,beta,ext,n,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
__m256i* ext_128=(__m256i*) ext;
__m256i* s1_128=(__m256i*) systematic1;
__m256i* s0_128=(__m256i*) systematic0;
int myloop=n>>3;
for (i=0; i<myloop; i++) {
*ext_128=_mm256_adds_epi16(_mm256_subs_epi16(*ext_128,*s1_128++),*s0_128++);
ext_128++;
}
}
}
// fprintf(fdavx2,"crc %x, oldcrc %x\n",crc,oldcrc);
_mm_empty();
_m_empty();
#ifdef DEBUG_LOGMAP
fclose(fdavx2);
#endif
return(iteration_cnt);
}
#endif //__AVX2__
...@@ -63,12 +63,18 @@ ...@@ -63,12 +63,18 @@
#include "mex.h" #include "mex.h"
#endif #endif
//#define DEBUG_LOGMAP
#define print_shorts(s,x) printf("%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7]) #ifdef DEBUG_LOGMAP
#define print_shorts(s,x) fprintf(fdsse4,"%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7])
#endif
//#define DEBUG_LOGMAP
#ifdef DEBUG_LOGMAP
FILE *fdsse4;
#endif
typedef int16_t llr_t; // internal decoder LLR data is 16-bit fixed typedef int16_t llr_t; // internal decoder LLR data is 16-bit fixed
typedef int16_t channel_t; typedef int16_t channel_t;
#define MAX 256 #define MAX 256
...@@ -99,7 +105,7 @@ void log_map16(llr_t* systematic, ...@@ -99,7 +105,7 @@ void log_map16(llr_t* systematic,
{ {
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
msg("log_map, frame_length %d\n",frame_length); fprintf(fdsse4,"log_map, frame_length %d\n",frame_length);
#endif #endif
start_meas(gamma_stats) ; start_meas(gamma_stats) ;
...@@ -135,22 +141,35 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity ...@@ -135,22 +141,35 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
#endif #endif
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
msg("compute_gamma, %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length); fprintf(fdsse4,"compute_gamma (sse_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
#endif #endif
#ifndef __AVX2__
K1=frame_length>>3; K1=frame_length>>3;
#else
if ((frame_length&15) > 0)
K1=(frame_length+1)>>4;
else
K1=frame_length>>4;
#endif
for (k=0; k<K1; k++) { for (k=0; k<K1; k++) {
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
#ifndef __AVX2__
m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k],y_parity128[k]),1); m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k],y_parity128[k]),1);
m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k],y_parity128[k]),1); m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k],y_parity128[k]),1);
#else
((__m256i*)m11_128)[k] = _mm256_srai_epi16(_mm256_adds_epi16(((__m256i*)systematic128)[k],((__m256i*)y_parity128)[k]),1);
// ((__m256i*)m10_128)[k] = _mm256_srai_epi16(_mm256_subs_epi16(((__m256i*)y_parity128)[k],((__m256i*)systematic128)[k]),1);
((__m256i*)m10_128)[k] = _mm256_srai_epi16(_mm256_subs_epi16(((__m256i*)systematic128)[k],((__m256i*)y_parity128)[k]),1);
#endif
#elif defined(__arm__) #elif defined(__arm__)
m11_128[k] = vhaddq_s16(systematic128[k],y_parity128[k]); m11_128[k] = vhaddq_s16(systematic128[k],y_parity128[k]);
m10_128[k] = vhsubq_s16(systematic128[k],y_parity128[k]); m10_128[k] = vhsubq_s16(systematic128[k],y_parity128[k]);
#endif #endif
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
printf("Loop index k, m11,m10\n"); fprintf(fdsse4,"Loop index k %d\n", k);
print_shorts("sys",(int16_t*)&systematic128[k]); print_shorts("sys",(int16_t*)&systematic128[k]);
print_shorts("yp",(int16_t*)&y_parity128[k]); print_shorts("yp",(int16_t*)&y_parity128[k]);
print_shorts("m11",(int16_t*)&m11_128[k]); print_shorts("m11",(int16_t*)&m11_128[k]);
...@@ -158,14 +177,28 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity ...@@ -158,14 +177,28 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
#endif #endif
} }
k=frame_length>>3;
// Termination // Termination
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k+term_flag],y_parity128[k]),1); m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k+term_flag],y_parity128[k]),1);
//#ifndef __AVX2__
#if 1
m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k+term_flag],y_parity128[k]),1); m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k+term_flag],y_parity128[k]),1);
#else
m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(y_parity128[k],systematic128[k+term_flag]),1);
#endif
#elif defined(__arm__) #elif defined(__arm__)
m11_128[k] = vhaddq_s16(systematic128[k+term_flag],y_parity128[k]); m11_128[k] = vhaddq_s16(systematic128[k+term_flag],y_parity128[k]);
m10_128[k] = vhsubq_s16(systematic128[k+term_flag],y_parity128[k]); m10_128[k] = vhsubq_s16(systematic128[k+term_flag],y_parity128[k]);
#endif #endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"Loop index k %d (term flag %d)\n", k,term_flag);
print_shorts("sys",(int16_t*)&systematic128[k]);
print_shorts("yp",(int16_t*)&y_parity128[k]);
print_shorts("m11",(int16_t*)&m11_128[k]);
print_shorts("m10",(int16_t*)&m10_128[k]);
#endif
} }
#define L 40 #define L 40
...@@ -174,11 +207,21 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s ...@@ -174,11 +207,21 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
{ {
int k,l,l2,K1,rerun_flag=0; int k,l,l2,K1,rerun_flag=0;
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
__m128i *alpha128=(__m128i *)alpha,*alpha_ptr; __m128i *alpha128=(__m128i *)alpha,*alpha_ptr,*m11p,*m10p;
__m128i a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p; //#ifndef __AVX2__
#if 1
__m128i a0,a1,a2,a3,a4,a5,a6,a7;
__m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7; __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
__m128i new0,new1,new2,new3,new4,new5,new6,new7; __m128i new0,new1,new2,new3,new4,new5,new6,new7;
__m128i alpha_max; __m128i alpha_max;
#else
__m256i *alpha256=(__m256i *)alpha,*alpha_ptr256,m11,m10;
__m256i a01,a23,a45,a67,a02,a13,a64,a75;
__m256i m_b01,m_b23,m_b45,m_b67,new01,new23,new45,new67;
__m256i m11m10_256;
__m256i alpha_max;
#endif
#elif defined(__arm__) #elif defined(__arm__)
int16x8_t *alpha128=(int16x8_t *)alpha,*alpha_ptr; int16x8_t *alpha128=(int16x8_t *)alpha,*alpha_ptr;
int16x8_t a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p; int16x8_t a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p;
...@@ -188,10 +231,16 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s ...@@ -188,10 +231,16 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
#endif #endif
l2 = L>>3; l2 = L>>3;
K1 = (frame_length>>3); K1 = (frame_length>>3);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"compute_alpha (sse_16bit)\n");
#endif
for (l=K1;; l=l2,rerun_flag=1) { for (l=K1;; l=l2,rerun_flag=1) {
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
alpha128 = (__m128i *)alpha; alpha128 = (__m128i *)alpha;
//#ifdef __AVX2__
#if 0
alpha256 = (__m256i *)alpha;
#endif
#elif defined(__arm__) #elif defined(__arm__)
alpha128 = (int16x8_t *)alpha; alpha128 = (int16x8_t *)alpha;
#endif #endif
...@@ -218,7 +267,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s ...@@ -218,7 +267,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha128[7] = vdupq_n_s16(-MAX/2); alpha128[7] = vdupq_n_s16(-MAX/2);
#endif #endif
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
printf("Initial alpha\n"); fprintf(fdsse4,"Initial alpha\n");
print_shorts("a0",(int16_t*)&alpha128[0]); print_shorts("a0",(int16_t*)&alpha128[0]);
print_shorts("a1",(int16_t*)&alpha128[1]); print_shorts("a1",(int16_t*)&alpha128[1]);
print_shorts("a2",(int16_t*)&alpha128[2]); print_shorts("a2",(int16_t*)&alpha128[2]);
...@@ -258,7 +307,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s ...@@ -258,7 +307,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha[48] = -MAX/2; alpha[48] = -MAX/2;
alpha[56] = -MAX/2; alpha[56] = -MAX/2;
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
printf("Second run\n"); fprintf(fdsse4,"Second run\n");
print_shorts("a0",(int16_t*)&alpha128[0]); print_shorts("a0",(int16_t*)&alpha128[0]);
print_shorts("a1",(int16_t*)&alpha128[1]); print_shorts("a1",(int16_t*)&alpha128[1]);
print_shorts("a2",(int16_t*)&alpha128[2]); print_shorts("a2",(int16_t*)&alpha128[2]);
...@@ -272,6 +321,11 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s ...@@ -272,6 +321,11 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
} }
alpha_ptr = &alpha128[0]; alpha_ptr = &alpha128[0];
//#ifdef __AVX2__
#if 0
alpha_ptr256 = &alpha256[0];
#endif
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
m11p = (__m128i*)m_11; m11p = (__m128i*)m_11;
m10p = (__m128i*)m_10; m10p = (__m128i*)m_10;
...@@ -284,6 +338,8 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s ...@@ -284,6 +338,8 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
k++) { k++) {
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
//#ifndef __AVX2__
#if 1
a1=_mm_load_si128(&alpha_ptr[1]); a1=_mm_load_si128(&alpha_ptr[1]);
a3=_mm_load_si128(&alpha_ptr[3]); a3=_mm_load_si128(&alpha_ptr[3]);
a5=_mm_load_si128(&alpha_ptr[5]); a5=_mm_load_si128(&alpha_ptr[5]);
...@@ -328,6 +384,37 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s ...@@ -328,6 +384,37 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha_max = _mm_max_epi16(alpha_max,a5); alpha_max = _mm_max_epi16(alpha_max,a5);
alpha_max = _mm_max_epi16(alpha_max,a6); alpha_max = _mm_max_epi16(alpha_max,a6);
alpha_max = _mm_max_epi16(alpha_max,a7); alpha_max = _mm_max_epi16(alpha_max,a7);
#else
a02=_mm256_load_si256(&alpha_ptr256[0]);
a13=_mm256_load_si256(&alpha_ptr256[1]);
a64=_mm256_load_si256(&alpha_ptr256[2]);
a75=_mm256_load_si256(&alpha_ptr256[3]);
m11m10_256 = _mm256_insertf128_si256(m11m10_256,*m11p,0);
m11m10_256 = _mm256_insertf128_si256(m11m10_256,*m10p,1);
m_b01 = _mm256_adds_epi16(a13,m11m10_256); //negative m10
m_b23 = _mm256_subs_epi16(a75,m11m10_256); //negative m10
m_b45 = _mm256_subs_epi16(a13,m11m10_256); //negative m10
m_b67 = _mm256_adds_epi16(a75,m11m10_256); //negative m10
new01 = _mm256_subs_epi16(a02,m11m10_256); //negative m10
new23 = _mm256_adds_epi16(a64,m11m10_256); //negative m10
new45 = _mm256_adds_epi16(a02,m11m10_256); //negative m10
new67 = _mm256_subs_epi16(a64,m11m10_256); //negative m10
a01 = _mm256_max_epi16(m_b01,new01);
a23 = _mm256_max_epi16(m_b23,new23);
a45 = _mm256_max_epi16(m_b45,new45);
a67 = _mm256_max_epi16(m_b67,new67);
alpha_max = _mm256_max_epi16(a01,a23);
alpha_max = _mm256_max_epi16(alpha_max,a45);
alpha_max = _mm256_max_epi16(alpha_max,a67);
alpha_max = _mm256_max_epi16(alpha_max,_mm256_permutevar8x32_epi32(alpha_max,_mm256_set_epi32(3,2,1,0,7,6,5,4)));
#endif
#elif defined(__arm__) #elif defined(__arm__)
m_b0 = vqaddq_s16(alpha_ptr[1],*m11p); // m11 m_b0 = vqaddq_s16(alpha_ptr[1],*m11p); // m11
m_b4 = vqsubq_s16(alpha_ptr[1],*m11p); // m00=-m11 m_b4 = vqsubq_s16(alpha_ptr[1],*m11p); // m00=-m11
...@@ -367,9 +454,15 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s ...@@ -367,9 +454,15 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
#endif #endif
alpha_ptr+=8; alpha_ptr+=8;
//#ifdef __AVX2__
#if 0
alpha_ptr256+=4;
#endif
m11p++; m11p++;
m10p++; m10p++;
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
//#ifndef __AVX2__
#if 1
alpha_ptr[0] = _mm_subs_epi16(a0,alpha_max); alpha_ptr[0] = _mm_subs_epi16(a0,alpha_max);
alpha_ptr[1] = _mm_subs_epi16(a1,alpha_max); alpha_ptr[1] = _mm_subs_epi16(a1,alpha_max);
alpha_ptr[2] = _mm_subs_epi16(a2,alpha_max); alpha_ptr[2] = _mm_subs_epi16(a2,alpha_max);
...@@ -378,6 +471,18 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s ...@@ -378,6 +471,18 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha_ptr[5] = _mm_subs_epi16(a5,alpha_max); alpha_ptr[5] = _mm_subs_epi16(a5,alpha_max);
alpha_ptr[6] = _mm_subs_epi16(a6,alpha_max); alpha_ptr[6] = _mm_subs_epi16(a6,alpha_max);
alpha_ptr[7] = _mm_subs_epi16(a7,alpha_max); alpha_ptr[7] = _mm_subs_epi16(a7,alpha_max);
#else
a01 = _mm256_subs_epi16(a01,alpha_max);
a23 = _mm256_subs_epi16(a23,alpha_max);
a45 = _mm256_subs_epi16(a45,alpha_max);
a67 = _mm256_subs_epi16(a67,alpha_max);
alpha_ptr256[0] = _mm256_permute2x128_si256(a01,a23,0x20); //a02
alpha_ptr256[1] = _mm256_permute2x128_si256(a01,a23,0x13); //a13
alpha_ptr256[2] = _mm256_permute2x128_si256(a45,a67,0x02); //a64
alpha_ptr256[3] = _mm256_permute2x128_si256(a45,a67,0x31); //a75
#endif
#elif defined(__arm__) #elif defined(__arm__)
alpha_ptr[0] = vqsubq_s16(a0,alpha_max); alpha_ptr[0] = vqsubq_s16(a0,alpha_max);
alpha_ptr[1] = vqsubq_s16(a1,alpha_max); alpha_ptr[1] = vqsubq_s16(a1,alpha_max);
...@@ -390,7 +495,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s ...@@ -390,7 +495,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
#endif #endif
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
printf("Loop index %d, mb\n",k); fprintf(fdsse4,"Loop index %d\n",k);
print_shorts("mb0",(int16_t*)&m_b0); print_shorts("mb0",(int16_t*)&m_b0);
print_shorts("mb1",(int16_t*)&m_b1); print_shorts("mb1",(int16_t*)&m_b1);
print_shorts("mb2",(int16_t*)&m_b2); print_shorts("mb2",(int16_t*)&m_b2);
...@@ -400,7 +505,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s ...@@ -400,7 +505,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
print_shorts("mb6",(int16_t*)&m_b6); print_shorts("mb6",(int16_t*)&m_b6);
print_shorts("mb7",(int16_t*)&m_b7); print_shorts("mb7",(int16_t*)&m_b7);
printf("Loop index %d, new\n",k); fprintf(fdsse4,"Loop index %d, new\n",k);
print_shorts("new0",(int16_t*)&new0); print_shorts("new0",(int16_t*)&new0);
print_shorts("new1",(int16_t*)&new1); print_shorts("new1",(int16_t*)&new1);
print_shorts("new2",(int16_t*)&new2); print_shorts("new2",(int16_t*)&new2);
...@@ -410,7 +515,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s ...@@ -410,7 +515,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
print_shorts("new6",(int16_t*)&new6); print_shorts("new6",(int16_t*)&new6);
print_shorts("new7",(int16_t*)&new7); print_shorts("new7",(int16_t*)&new7);
printf("Loop index %d, after max\n",k); fprintf(fdsse4,"Loop index %d, after max\n",k);
print_shorts("a0",(int16_t*)&a0); print_shorts("a0",(int16_t*)&a0);
print_shorts("a1",(int16_t*)&a1); print_shorts("a1",(int16_t*)&a1);
print_shorts("a2",(int16_t*)&a2); print_shorts("a2",(int16_t*)&a2);
...@@ -420,7 +525,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s ...@@ -420,7 +525,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
print_shorts("a6",(int16_t*)&a6); print_shorts("a6",(int16_t*)&a6);
print_shorts("a7",(int16_t*)&a7); print_shorts("a7",(int16_t*)&a7);
printf("Loop index %d\n",k); fprintf(fdsse4,"Loop index %d\n",k);
print_shorts("a0",(int16_t*)&alpha_ptr[0]); print_shorts("a0",(int16_t*)&alpha_ptr[0]);
print_shorts("a1",(int16_t*)&alpha_ptr[1]); print_shorts("a1",(int16_t*)&alpha_ptr[1]);
print_shorts("a2",(int16_t*)&alpha_ptr[2]); print_shorts("a2",(int16_t*)&alpha_ptr[2]);
...@@ -463,25 +568,33 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh ...@@ -463,25 +568,33 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
llr_t beta0,beta1; llr_t beta0,beta1;
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
msg("compute_beta, %p,%p,%p,%p,framelength %d,F %d\n", fprintf(fdsse4,"compute_beta, %p,%p,%p,%p,framelength %d,F %d\n",
beta,m_11,m_10,alpha,frame_length,F); beta,m_11,m_10,alpha,frame_length,F);
#endif #endif
// termination for beta initialization // termination for beta initialization
// printf("beta init: offset8 %d\n",offset8_flag); // fprintf(fdsse4,"beta init: offset8 %d\n",offset8_flag);
m11=(int16_t)m_11[2+frame_length]; m11=(int16_t)m_11[2+frame_length];
//#ifndef __AVX2__
#if 1
m10=(int16_t)m_10[2+frame_length]; m10=(int16_t)m_10[2+frame_length];
#else
// printf("m11,m10 %d,%d\n",m11,m10); m10=-(int16_t)m_10[2+frame_length];
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"m11,m10 %d,%d\n",m11,m10);
#endif
beta0 = -m11;//M0T_TERM; beta0 = -m11;//M0T_TERM;
beta1 = m11;//M1T_TERM; beta1 = m11;//M1T_TERM;
m11=(int16_t)m_11[1+frame_length]; m11=(int16_t)m_11[1+frame_length];
m10=(int16_t)m_10[1+frame_length]; m10=(int16_t)m_10[1+frame_length];
// printf("m11,m10 %d,%d\n",m11,m10); #ifdef DEBUG_LOGMAP
fprintf(fdsse4,"m11,m10 %d,%d\n",m11,m10);
#endif
beta0_2 = beta0-m11;//+M0T_TERM; beta0_2 = beta0-m11;//+M0T_TERM;
beta1_2 = beta0+m11;//+M1T_TERM; beta1_2 = beta0+m11;//+M1T_TERM;
...@@ -489,8 +602,9 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh ...@@ -489,8 +602,9 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta3_2 = beta1-m10;//+M3T_TERM; beta3_2 = beta1-m10;//+M3T_TERM;
m11=(int16_t)m_11[frame_length]; m11=(int16_t)m_11[frame_length];
m10=(int16_t)m_10[frame_length]; m10=(int16_t)m_10[frame_length];
// printf("m11,m10 %d,%d (%p)\n",m11,m10,m_11+frame_length); #ifdef DEBUG_LOGMAP
fprintf(fdsse4,"m11,m10 %d,%d\n",m11,m10);
#endif
beta0_16 = beta0_2-m11;//+M0T_TERM; beta0_16 = beta0_2-m11;//+M0T_TERM;
beta1_16 = beta0_2+m11;//+M1T_TERM; beta1_16 = beta0_2+m11;//+M1T_TERM;
beta2_16 = beta1_2+m10;//+M2T_TERM; beta2_16 = beta1_2+m10;//+M2T_TERM;
...@@ -536,6 +650,17 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh ...@@ -536,6 +650,17 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[5] = alpha128[5+(frame_length)]; beta_ptr[5] = alpha128[5+(frame_length)];
beta_ptr[6] = alpha128[6+(frame_length)]; beta_ptr[6] = alpha128[6+(frame_length)];
beta_ptr[7] = alpha128[7+(frame_length)]; beta_ptr[7] = alpha128[7+(frame_length)];
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"beta init \n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
} else { } else {
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
beta128 = (__m128i*)&beta[0]; beta128 = (__m128i*)&beta[0];
...@@ -558,6 +683,17 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh ...@@ -558,6 +683,17 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[5] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[5],16); beta_ptr[5] = vsetq_lane_s16(beta[43],beta_ptr[5],4); beta_ptr[5] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[5],16); beta_ptr[5] = vsetq_lane_s16(beta[43],beta_ptr[5],4);
beta_ptr[6] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[6],16); beta_ptr[6] = vsetq_lane_s16(beta[51],beta_ptr[6],4); beta_ptr[6] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[6],16); beta_ptr[6] = vsetq_lane_s16(beta[51],beta_ptr[6],4);
beta_ptr[7] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[7],16); beta_ptr[7] = vsetq_lane_s16(beta[59],beta_ptr[7],4); beta_ptr[7] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[7],16); beta_ptr[7] = vsetq_lane_s16(beta[59],beta_ptr[7],4);
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"beta init (second run) \n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif #endif
} }
...@@ -582,6 +718,17 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh ...@@ -582,6 +718,17 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[7] = vsetq_lane_s16(beta7_16,beta_ptr[7],7); beta_ptr[7] = vsetq_lane_s16(beta7_16,beta_ptr[7],7);
#endif #endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"beta init (after insert) \n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
int loopval=((rerun_flag==0)?0:((frame_length-L)>>3)); int loopval=((rerun_flag==0)?0:((frame_length-L)>>3));
for (k=(frame_length>>3)-1; k>=loopval; k--) { for (k=(frame_length>>3)-1; k>=loopval; k--) {
...@@ -589,6 +736,9 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh ...@@ -589,6 +736,9 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
m11_128=((__m128i*)m_11)[k]; m11_128=((__m128i*)m_11)[k];
m10_128=((__m128i*)m_10)[k]; m10_128=((__m128i*)m_10)[k];
//#ifndef __AVX2__
#if 1
m_b0 = _mm_adds_epi16(beta_ptr[4],m11_128); //m11 m_b0 = _mm_adds_epi16(beta_ptr[4],m11_128); //m11
m_b1 = _mm_subs_epi16(beta_ptr[4],m11_128); //m00 m_b1 = _mm_subs_epi16(beta_ptr[4],m11_128); //m00
m_b2 = _mm_subs_epi16(beta_ptr[5],m10_128); //m01 m_b2 = _mm_subs_epi16(beta_ptr[5],m10_128); //m01
...@@ -598,6 +748,7 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh ...@@ -598,6 +748,7 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
m_b6 = _mm_subs_epi16(beta_ptr[7],m11_128); //m00 m_b6 = _mm_subs_epi16(beta_ptr[7],m11_128); //m00
m_b7 = _mm_adds_epi16(beta_ptr[7],m11_128); //m11 m_b7 = _mm_adds_epi16(beta_ptr[7],m11_128); //m11
new0 = _mm_subs_epi16(beta_ptr[0],m11_128); //m00 new0 = _mm_subs_epi16(beta_ptr[0],m11_128); //m00
new1 = _mm_adds_epi16(beta_ptr[0],m11_128); //m11 new1 = _mm_adds_epi16(beta_ptr[0],m11_128); //m11
new2 = _mm_adds_epi16(beta_ptr[1],m10_128); //m10 new2 = _mm_adds_epi16(beta_ptr[1],m10_128); //m10
...@@ -607,8 +758,29 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh ...@@ -607,8 +758,29 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
new6 = _mm_adds_epi16(beta_ptr[3],m11_128); //m11 new6 = _mm_adds_epi16(beta_ptr[3],m11_128); //m11
new7 = _mm_subs_epi16(beta_ptr[3],m11_128); //m00 new7 = _mm_subs_epi16(beta_ptr[3],m11_128); //m00
#else
b01=_mm256_load_si256(&((_m256i*)beta_ptr)[0]);
b23=_mm256_load_si256(&((_m256i*)beta_ptr)[1]);
b45=_mm256_load_si256(&((_m256i*)beta_ptr)[2]);
b67=_mm256_load_si256(&((_m256i*)beta_ptr)[3]);
m11m10_256 = _mm256_insertf128_si256(m11m10_256,m11_128,0);
m11m10_256 = _mm256_insertf128_si256(m11m10_256,m10_128,1);
m_b02 = _mm256_adds_epi16(b45,m11m10_256); //negative m10
m_b13 = _mm256_subs_epi16(b45,m11m10_256); //negative m10
m_b64 = _mm256_subs_epi16(b67,m11m10_256); //negative m10
m_b75 = _mm256_adds_epi16(b67,m11m10_256); //negative m10
new02 = _mm256_subs_epi16(b01,m11m10_256); //negative m10
new13 = _mm256_adds_epi16(b01,m11m10_256); //negative m10
new64 = _mm256_adds_epi16(b23,m11m10_256); //negative m10
new75 = _mm256_subs_epi16(b24,m11m10_256); //negative m10
#endif
beta_ptr-=8; beta_ptr-=8;
//#ifndef __AVX2__
#if 1
beta_ptr[0] = _mm_max_epi16(m_b0,new0); beta_ptr[0] = _mm_max_epi16(m_b0,new0);
beta_ptr[1] = _mm_max_epi16(m_b1,new1); beta_ptr[1] = _mm_max_epi16(m_b1,new1);
beta_ptr[2] = _mm_max_epi16(m_b2,new2); beta_ptr[2] = _mm_max_epi16(m_b2,new2);
...@@ -634,6 +806,28 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh ...@@ -634,6 +806,28 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[5] = _mm_subs_epi16(beta_ptr[5],beta_max); beta_ptr[5] = _mm_subs_epi16(beta_ptr[5],beta_max);
beta_ptr[6] = _mm_subs_epi16(beta_ptr[6],beta_max); beta_ptr[6] = _mm_subs_epi16(beta_ptr[6],beta_max);
beta_ptr[7] = _mm_subs_epi16(beta_ptr[7],beta_max); beta_ptr[7] = _mm_subs_epi16(beta_ptr[7],beta_max);
#else
b02 = _mm256_max_epi16(m_b02,new02);
b13 = _mm256_max_epi16(m_b13,new13);
b64 = _mm256_max_epi16(m_b64,new64);
b75 = _mm256_max_epi16(m_b75,new75);
beta_max = _mm256_max_epi16(b02,b13);
beta_max = _mm256_max_epi16(beta_max,b64);
beta_max = _mm256_max_epi16(beta_max,b75);
beta_max = _mm256_max_epi16(beta_max,_mm256_permutevar8x32_epi32(betaa_max,_mm256_set_epi32(3,2,1,0,7,6,5,4)));
b02 = _mm256_subs_epi16(b02,beta_max);
b13 = _mm256_subs_epi16(b13,beta_max);
b64 = _mm256_subs_epi16(b64,beta_max);
b75 = _mm256_subs_epi16(b75,beta_max);
((_m256i*)beta_ptr)[0]) = _mm256_permute2x128_si256(b02,b13,0x02); //b01
((_m256i*)beta_ptr)[1]) = _mm256_permute2x128_si256(b02,b13,0x31); //b23
((_m256i*)beta_ptr)[2]) = _mm256_permute2x128_si256(b64,b75,0x13); //b45
((_m256i*)beta_ptr)[3]) = _mm256_permute2x128_si256(b64,b75,0x20); //b67
#endif
#elif defined(__arm__) #elif defined(__arm__)
m11_128=((int16x8_t*)m_11)[k]; m11_128=((int16x8_t*)m_11)[k];
m10_128=((int16x8_t*)m_10)[k]; m10_128=((int16x8_t*)m_10)[k];
...@@ -684,6 +878,18 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh ...@@ -684,6 +878,18 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[7] = vqsubq_s16(beta_ptr[7],beta_max); beta_ptr[7] = vqsubq_s16(beta_ptr[7],beta_max);
#endif #endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"Loop index %d, mb\n",k);
fprintf(fdsse4,"beta init (after max)\n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
#endif
} }
...@@ -721,7 +927,7 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, ...@@ -721,7 +927,7 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
// //
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
msg("compute_ext, %p, %p, %p, %p, %p, %p ,framelength %d\n",alpha,beta,m_11,m_10,ext,systematic,frame_length); fprintf(fdsse4,"compute_ext (sse_16bit), %p, %p, %p, %p, %p, %p ,framelength %d\n",alpha,beta,m_11,m_10,ext,systematic,frame_length);
#endif #endif
alpha_ptr = alpha128; alpha_ptr = alpha128;
...@@ -736,7 +942,7 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, ...@@ -736,7 +942,7 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
ext_128 = (__m128i*)&ext[k<<3]; ext_128 = (__m128i*)&ext[k<<3];
/* /*
printf("EXT %03d\n",k); fprintf(fdsse4,"EXT %03d\n",k);
print_shorts("a0:",&alpha_ptr[0]); print_shorts("a0:",&alpha_ptr[0]);
print_shorts("a1:",&alpha_ptr[1]); print_shorts("a1:",&alpha_ptr[1]);
print_shorts("a2:",&alpha_ptr[2]); print_shorts("a2:",&alpha_ptr[2]);
...@@ -754,6 +960,9 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, ...@@ -754,6 +960,9 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
print_shorts("b6:",&beta_ptr[6]); print_shorts("b6:",&beta_ptr[6]);
print_shorts("b7:",&beta_ptr[7]); print_shorts("b7:",&beta_ptr[7]);
*/ */
//#ifndef __AVX2__
#if 1
m00_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00; m00_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
m11_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11; m11_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
m00_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00; m00_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
...@@ -770,6 +979,32 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, ...@@ -770,6 +979,32 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
m10_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10; m10_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
m10_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10; m10_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
m01_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01; m01_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
#else
m00_1 = _mm_adds_epi16(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00;
m10_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
m11_1 = _mm_adds_epi16(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11;
m01_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
m11_2 = _mm_adds_epi16(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11;
m01_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01;
m00_2 = _mm_adds_epi16(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00;
m10_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
m11_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11;
m01_3 = _mm_adds_epi16(alpha_ptr[4],beta_ptr[2]); //ALPHA_BETA_3m01;
m00_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
m10_3 = _mm_adds_epi16(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10;
m00_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
m10_4 = _mm_adds_epi16(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10;
m11_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
m01_4 = _mm_adds_epi16(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01;
#endif
/* /*
print_shorts("m11_1:",&m11_1); print_shorts("m11_1:",&m11_1);
print_shorts("m11_2:",&m11_2); print_shorts("m11_2:",&m11_2);
...@@ -816,15 +1051,15 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, ...@@ -816,15 +1051,15 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
// print_shorts("m10_1:",&m10_1); // print_shorts("m10_1:",&m10_1);
*ext_128 = _mm_subs_epi16(m10_1,m01_1); *ext_128 = _mm_subs_epi16(m10_1,m01_1);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"ext %p\n",ext_128);
print_shorts("ext:",(int16_t*)ext_128);
print_shorts("m11:",(int16_t*)m11_128);
print_shorts("m10:",(int16_t*)m10_128);
print_shorts("m10_1:",(int16_t*)&m10_1);
print_shorts("m01_1:",(int16_t*)&m01_1);
#endif
/*
print_shorts("ext:",ext_128);
print_shorts("m11:",m11_128);
print_shorts("m10:",m10_128);
print_shorts("m10_1:",&m10_1);
print_shorts("m01_1:",&m01_1);
print_shorts("syst:",systematic_128);
*/
#elif defined(__arm__) #elif defined(__arm__)
m11_128 = (int16x8_t*)&m_11[k<<3]; m11_128 = (int16x8_t*)&m_11[k<<3];
m10_128 = (int16x8_t*)&m_10[k<<3]; m10_128 = (int16x8_t*)&m_10[k<<3];
...@@ -927,7 +1162,7 @@ void init_td16() ...@@ -927,7 +1162,7 @@ void init_td16()
// j-=(n-1); // j-=(n-1);
pi2tab16[ind][i] = j; pi2tab16[ind][i] = j;
// printf("pi2[%d] = %d\n",i,j); // fprintf(fdsse4,"pi2[%d] = %d\n",i,j);
} }
} }
...@@ -964,19 +1199,19 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -964,19 +1199,19 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
n is the size in bits of the coded block, with the tail */ n is the size in bits of the coded block, with the tail */
llr_t systematic0[n+16] __attribute__ ((aligned(16))); llr_t systematic0[n+16] __attribute__ ((aligned(32)));
llr_t systematic1[n+16] __attribute__ ((aligned(16))); llr_t systematic1[n+16] __attribute__ ((aligned(32)));
llr_t systematic2[n+16] __attribute__ ((aligned(16))); llr_t systematic2[n+16] __attribute__ ((aligned(32)));
llr_t yparity1[n+16] __attribute__ ((aligned(16))); llr_t yparity1[n+16] __attribute__ ((aligned(32)));
llr_t yparity2[n+16] __attribute__ ((aligned(16))); llr_t yparity2[n+16] __attribute__ ((aligned(32)));
llr_t ext[n+128] __attribute__((aligned(16))); llr_t ext[n+128] __attribute__((aligned(32)));
llr_t ext2[n+128] __attribute__((aligned(16))); llr_t ext2[n+128] __attribute__((aligned(32)));
llr_t alpha[(n+16)*8] __attribute__ ((aligned(16))); llr_t alpha[(n+16)*8] __attribute__ ((aligned(32)));
llr_t beta[(n+16)*8] __attribute__ ((aligned(16))); llr_t beta[(n+16)*8] __attribute__ ((aligned(32)));
llr_t m11[n+16] __attribute__ ((aligned(16))); llr_t m11[n+32] __attribute__ ((aligned(32)));
llr_t m10[n+16] __attribute__ ((aligned(16))); llr_t m10[n+32] __attribute__ ((aligned(32)));
int *pi2_p,*pi4_p,*pi5_p,*pi6_p; int *pi2_p,*pi4_p,*pi5_p,*pi6_p;
...@@ -989,7 +1224,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -989,7 +1224,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
__m128i *yp128; __m128i *yp128;
__m128i tmp, zeros=_mm_setzero_si128(); __m128i tmp, zeros=_mm_setzero_si128();
register __m128i tmpe; __m128i tmpe;
#elif defined(__arm__) #elif defined(__arm__)
int16x8_t *yp128; int16x8_t *yp128;
// int16x8_t tmp128[(n+8)>>3]; // int16x8_t tmp128[(n+8)>>3];
...@@ -1000,12 +1235,20 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -1000,12 +1235,20 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
#endif #endif
int offset8_flag=0; int offset8_flag=0;
#ifdef DEBUG_LOGMAP
fdsse4 = fopen("dump_sse4.txt","w");
printf("tc sse4_16 (y) %p\n",y);
#endif
if (crc_type > 3) { if (crc_type > 3) {
msg("Illegal crc length!\n"); printf("Illegal crc length!\n");
return 255; return 255;
} }
start_meas(init_stats); start_meas(init_stats);
...@@ -1013,7 +1256,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -1013,7 +1256,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
for (iind=0; iind < 188 && f1f2mat[iind].nb_bits != n; iind++); for (iind=0; iind < 188 && f1f2mat[iind].nb_bits != n; iind++);
if ( iind == 188 ) { if ( iind == 188 ) {
msg("Illegal frame length!\n"); printf("Illegal frame length!\n");
return 255; return 255;
} }
...@@ -1059,62 +1302,74 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -1059,62 +1302,74 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
tmpe = _mm_load_si128(yp128); tmpe = _mm_load_si128(yp128);
// fprintf(fdsse4,"yp128 %p\n",yp128);
// print_shorts("tmpe",(int16_t *)&tmpe);
s[j] = _mm_extract_epi16(tmpe,0); s[j] = _mm_extract_epi16(tmpe,0);
yp1[j] = _mm_extract_epi16(tmpe,1); yp1[j] = _mm_extract_epi16(tmpe,1);
yp2[j] = _mm_extract_epi16(tmpe,2); yp2[j] = _mm_extract_epi16(tmpe,2);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); #ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init0: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[1]; j=pi2_p[1];
s[j] = _mm_extract_epi16(tmpe,3); s[j] = _mm_extract_epi16(tmpe,3);
yp1[j] = _mm_extract_epi16(tmpe,4); yp1[j] = _mm_extract_epi16(tmpe,4);
yp2[j] = _mm_extract_epi16(tmpe,5); yp2[j] = _mm_extract_epi16(tmpe,5);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); #ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init1: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[2]; j=pi2_p[2];
s[j] = _mm_extract_epi16(tmpe,6); s[j] = _mm_extract_epi16(tmpe,6);
yp1[j] = _mm_extract_epi16(tmpe,7); yp1[j] = _mm_extract_epi16(tmpe,7);
tmpe = _mm_load_si128(&yp128[1]); tmpe = _mm_load_si128(&yp128[1]);
yp2[j] = _mm_extract_epi16(tmpe,0); yp2[j] = _mm_extract_epi16(tmpe,0);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); #ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init2: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[3]; j=pi2_p[3];
s[j] = _mm_extract_epi16(tmpe,1); s[j] = _mm_extract_epi16(tmpe,1);
yp1[j] = _mm_extract_epi16(tmpe,2); yp1[j] = _mm_extract_epi16(tmpe,2);
yp2[j] = _mm_extract_epi16(tmpe,3); yp2[j] = _mm_extract_epi16(tmpe,3);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); #ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init3: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[4]; j=pi2_p[4];
s[j] = _mm_extract_epi16(tmpe,4); s[j] = _mm_extract_epi16(tmpe,4);
yp1[j] = _mm_extract_epi16(tmpe,5); yp1[j] = _mm_extract_epi16(tmpe,5);
yp2[j] = _mm_extract_epi16(tmpe,6); yp2[j] = _mm_extract_epi16(tmpe,6);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); #ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init4: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[5]; j=pi2_p[5];
s[j] = _mm_extract_epi16(tmpe,7); s[j] = _mm_extract_epi16(tmpe,7);
tmpe = _mm_load_si128(&yp128[2]); tmpe = _mm_load_si128(&yp128[2]);
yp1[j] = _mm_extract_epi16(tmpe,0); yp1[j] = _mm_extract_epi16(tmpe,0);
yp2[j] = _mm_extract_epi16(tmpe,1); yp2[j] = _mm_extract_epi16(tmpe,1);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); #ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init5: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[6]; j=pi2_p[6];
s[j] = _mm_extract_epi16(tmpe,2); s[j] = _mm_extract_epi16(tmpe,2);
yp1[j] = _mm_extract_epi16(tmpe,3); yp1[j] = _mm_extract_epi16(tmpe,3);
yp2[j] = _mm_extract_epi16(tmpe,4); yp2[j] = _mm_extract_epi16(tmpe,4);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); #ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init6: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[7]; j=pi2_p[7];
s[j] = _mm_extract_epi16(tmpe,5); s[j] = _mm_extract_epi16(tmpe,5);
yp1[j] = _mm_extract_epi16(tmpe,6); yp1[j] = _mm_extract_epi16(tmpe,6);
yp2[j] = _mm_extract_epi16(tmpe,7); yp2[j] = _mm_extract_epi16(tmpe,7);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]); #ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init7: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
#elif defined(__arm__) #elif defined(__arm__)
s[j] = vgetq_lane_s16(yp128[0],0); s[j] = vgetq_lane_s16(yp128[0],0);
...@@ -1172,7 +1427,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -1172,7 +1427,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
yp1[i] = *yp; yp1[i] = *yp;
yp++; yp++;
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
msg("Term 1 (%d): %d %d\n",i,s[i],yp1[i]); fprintf(fdsse4,"Term 1 (%d): %d %d\n",i,s[i],yp1[i]);
#endif //DEBUG_LOGMAP #endif //DEBUG_LOGMAP
} }
...@@ -1184,12 +1439,12 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -1184,12 +1439,12 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
yp2[i-8] = *yp; yp2[i-8] = *yp;
yp++; yp++;
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
msg("Term 2 (%d): %d %d\n",i-3,s[i],yp2[i-8]); fprintf(fdsse4,"Term 2 (%d): %d %d\n",i-3,s[i],yp2[i-8]);
#endif //DEBUG_LOGMAP #endif //DEBUG_LOGMAP
} }
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
msg("\n"); fprintf(fdsse4,"\n");
#endif //DEBUG_LOGMAP #endif //DEBUG_LOGMAP
stop_meas(init_stats); stop_meas(init_stats);
...@@ -1201,7 +1456,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -1201,7 +1456,7 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
while (iteration_cnt++ < max_iterations) { while (iteration_cnt++ < max_iterations) {
#ifdef DEBUG_LOGMAP #ifdef DEBUG_LOGMAP
printf("\n*******************ITERATION %d (n %d), ext %p\n\n",iteration_cnt,n,ext); fprintf(fdsse4,"\n*******************ITERATION %d (n %d), ext %p\n\n",iteration_cnt,n,ext);
#endif //DEBUG_LOGMAP #endif //DEBUG_LOGMAP
start_meas(intl1_stats); start_meas(intl1_stats);
...@@ -1209,24 +1464,29 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -1209,24 +1464,29 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
pi4_p=pi4tab16[iind]; pi4_p=pi4tab16[iind];
for (i=0; i<(n>>3); i++) { // steady-state portion for (i=0; i<(n>>3); i++) { // steady-state portion
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],0); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],0);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],1); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],1);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],2); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],2);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],3); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],3);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],4); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],4);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],5); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],5);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],6); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],6);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],7); ((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],7);
#elif defined(__arm__) #elif defined(__arm__)
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],0); ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],0);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],1); ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],1);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],2); ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],2);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],3); ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],3);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],4); ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],4);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],5); ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],5);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],6); ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],6);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(((llr_t*)ext)[*pi4_p++],((int16x8_t*)systematic2)[i],7); ((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],7);
#endif
#ifdef DEBUG_LOGMAP
print_shorts("syst2",(int16_t*)&((__m128i *)systematic2)[i]);
#endif #endif
} }
...@@ -1261,6 +1521,9 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -1261,6 +1521,9 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,6); tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,6);
tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,7); tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,7);
((int16x8_t *)systematic1)[i] = vqaddq_s16(vqsubq_s16(tmp,((int16x8_t*)ext)[i]),((int16x8_t *)systematic0)[i]); ((int16x8_t *)systematic1)[i] = vqaddq_s16(vqsubq_s16(tmp,((int16x8_t*)ext)[i]),((int16x8_t *)systematic0)[i]);
#endif
#ifdef DEBUG_LOGMAP
print_shorts("syst1",(int16_t*)&((__m128i *)systematic1)[i]);
#endif #endif
} }
...@@ -1278,6 +1541,9 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -1278,6 +1541,9 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],2); tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],2);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],1); tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],1);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],0); tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],0);
#ifdef DEBUG_LOGMAP
print_shorts("tmp",(int16_t*)&tmp);
#endif
tmp=_mm_cmpgt_epi8(_mm_packs_epi16(tmp,zeros),zeros); tmp=_mm_cmpgt_epi8(_mm_packs_epi16(tmp,zeros),zeros);
decoded_bytes[i]=(unsigned char)_mm_movemask_epi8(tmp); decoded_bytes[i]=(unsigned char)_mm_movemask_epi8(tmp);
#elif defined(__arm__) #elif defined(__arm__)
...@@ -1297,6 +1563,10 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -1297,6 +1563,10 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
uint64x2_t Mask = vpaddlq_u32(vpaddlq_u16(vandq_u16(vcgtq_s16(tmp,zeros), Powers))); uint64x2_t Mask = vpaddlq_u32(vpaddlq_u16(vandq_u16(vcgtq_s16(tmp,zeros), Powers)));
uint64x1_t Mask64 = vget_high_u64(Mask)+vget_low_u64(Mask); uint64x1_t Mask64 = vget_high_u64(Mask)+vget_low_u64(Mask);
decoded_bytes[i] = (uint8_t)Mask64; decoded_bytes[i] = (uint8_t)Mask64;
#endif
#ifdef DEBUG_LOGMAP
print_shorts("tmp",(int16_t*)&tmp);
fprintf(fdsse4,"decoded_bytes[%d] %x\n",i,decoded_bytes[i]);
#endif #endif
} }
} }
...@@ -1344,6 +1614,9 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -1344,6 +1614,9 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
} }
stop_meas(intl2_stats); stop_meas(intl2_stats);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"oldcrc %x, crc %x\n",oldcrc,crc);
#endif
if ((crc == oldcrc) && (crc!=0)) { if ((crc == oldcrc) && (crc!=0)) {
return(iteration_cnt); return(iteration_cnt);
...@@ -1374,8 +1647,12 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y, ...@@ -1374,8 +1647,12 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
} }
} }
} }
// fprintf(fdsse4,"crc %x, oldcrc %x\n",crc,oldcrc);
// printf("crc %x, oldcrc %x\n",crc,oldcrc); #ifdef DEBUG_LOGMAP
fclose(fdsse4);
#endif
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
_mm_empty(); _mm_empty();
......
...@@ -5,7 +5,7 @@ RATE12CC_SRC = ccoding_byte.c viterbi.c crc_byte.c ...@@ -5,7 +5,7 @@ RATE12CC_SRC = ccoding_byte.c viterbi.c crc_byte.c
all: 3gpplte_sse all: 3gpplte_sse
3gpplte_sse: $(TURBO_SRC) 3gpplte_sse: $(TURBO_SRC)
gcc -o 3gpplte_sse 3gpplte_sse.c -msse4 -Wall -g -ggdb -DMAIN gcc -o 3gpplte_sse 3gpplte_sse.c -msse4 -Wall -g -ggdb -DTC_MAIN -I../..
......
...@@ -483,6 +483,24 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y, ...@@ -483,6 +483,24 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
time_stats_t *intl1_stats, time_stats_t *intl1_stats,
time_stats_t *intl2_stats); time_stats_t *intl2_stats);
uint8_t phy_threegpplte_turbo_decoder16avx2(int16_t *y,
int16_t *y2,
uint8_t *decoded_bytes,
uint8_t *decoded_bytes2,
uint16_t n,
uint16_t interleaver_f1,
uint16_t interleaver_f2,
uint8_t max_iterations,
uint8_t crc_type,
uint8_t F,
time_stats_t *init_stats,
time_stats_t *alpha_stats,
time_stats_t *beta_stats,
time_stats_t *gamma_stats,
time_stats_t *ext_stats,
time_stats_t *intl1_stats,
time_stats_t *intl2_stats);
/*! /*!
\brief This routine performs max-logmap detection for the 3GPP turbo code (with termination). It is optimized for SIMD processing and 8-bit \brief This routine performs max-logmap detection for the 3GPP turbo code (with termination). It is optimized for SIMD processing and 8-bit
LLR arithmetic, and requires SSE2,SSSE3 and SSE4.1 (gcc >=4.3 and appropriate CPU) LLR arithmetic, and requires SSE2,SSSE3 and SSE4.1 (gcc >=4.3 and appropriate CPU)
......
...@@ -888,12 +888,16 @@ void phy_init_lte_top(LTE_DL_FRAME_PARMS *lte_frame_parms) ...@@ -888,12 +888,16 @@ void phy_init_lte_top(LTE_DL_FRAME_PARMS *lte_frame_parms)
ccodelte_init(); ccodelte_init();
ccodelte_init_inv(); ccodelte_init_inv();
treillis_table_init();
phy_generate_viterbi_tables(); phy_generate_viterbi_tables();
phy_generate_viterbi_tables_lte(); phy_generate_viterbi_tables_lte();
init_td8(); init_td8();
init_td16(); init_td16();
#ifdef __AVX2__
init_td16avx2();
#endif
lte_sync_time_init(lte_frame_parms); lte_sync_time_init(lte_frame_parms);
......
...@@ -26,187 +26,187 @@ ...@@ -26,187 +26,187 @@
Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE Address : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE
*******************************************************************************/ *******************************************************************************/
short filt24_0[24] __attribute__((aligned(16))) ={ short filt24_0[24] __attribute__((aligned(32))) ={
2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0 2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_0_dcl[24] __attribute__((aligned(16))) ={ short filt24_0_dcl[24] __attribute__((aligned(32))) ={
2341,4681,7022,9362,11703,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0 2341,4681,7022,9362,11703,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_0_dcr[24] __attribute__((aligned(16))) ={ short filt24_0_dcr[24] __attribute__((aligned(32))) ={
2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0,0,0,0,0,0 2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_1[24] __attribute__((aligned(16))) ={ short filt24_1[24] __attribute__((aligned(32))) ={
0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0 0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_1_dcl[24] __attribute__((aligned(16))) ={ short filt24_1_dcl[24] __attribute__((aligned(32))) ={
0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0 0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_1_dcr[24] __attribute__((aligned(16))) ={ short filt24_1_dcr[24] __attribute__((aligned(32))) ={
0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0,0,0,0,0,0 0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_2[24] __attribute__((aligned(16))) ={ short filt24_2[24] __attribute__((aligned(32))) ={
0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0 0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_2_dcl[24] __attribute__((aligned(16))) ={ short filt24_2_dcl[24] __attribute__((aligned(32))) ={
0,0,2341,4681,7022,9362, 11703,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0 0,0,2341,4681,7022,9362, 11703,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_2_dcr[24] __attribute__((aligned(16))) ={ short filt24_2_dcr[24] __attribute__((aligned(32))) ={
0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,4681,2341,0,0,0,0,0,0,0,0,0,0,0 0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,4681,2341,0,0,0,0,0,0,0,0,0,0,0
}; };
// X X X Y | X X X X | X Y X X // X X X Y | X X X X | X Y X X
short filt24_3[24] __attribute__((aligned(16))) ={ short filt24_3[24] __attribute__((aligned(32))) ={
0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0 0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_3_dcl[24] __attribute__((aligned(16))) ={ short filt24_3_dcl[24] __attribute__((aligned(32))) ={
0,0,0,2341,4681,7022,9362,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0 0,0,0,2341,4681,7022,9362,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0
}; };
// X X X Y | X X DC X X | X Y X X // X X X Y | X X DC X X | X Y X X
short filt24_3_dcr[24] __attribute__((aligned(16))) ={ short filt24_3_dcr[24] __attribute__((aligned(32))) ={
0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,7022,4681,2341,0,0,0,0,0,0,0,0,0,0 0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,7022,4681,2341,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_4[24] __attribute__((aligned(16))) ={ short filt24_4[24] __attribute__((aligned(32))) ={
0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0 0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0
}; };
short filt24_4_dcl[24] __attribute__((aligned(16))) ={ short filt24_4_dcl[24] __attribute__((aligned(32))) ={
0,0,0,0,2341,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0 0,0,0,0,2341,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0
}; };
short filt24_4_dcr[24] __attribute__((aligned(16))) ={ short filt24_4_dcr[24] __attribute__((aligned(32))) ={
0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,7022,4681,2341,0,0,0,0,0,0,0,0,0 0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,7022,4681,2341,0,0,0,0,0,0,0,0,0
}; };
short filt24_5[24] __attribute__((aligned(16))) ={ short filt24_5[24] __attribute__((aligned(32))) ={
0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0 0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0
}; };
// X X X Y | X X DC X X | X Y X X // X X X Y | X X DC X X | X Y X X
short filt24_5_dcl[24] __attribute__((aligned(16))) ={ short filt24_5_dcl[24] __attribute__((aligned(32))) ={
0,0,0,0,0,2341,4681,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0 0,0,0,0,0,2341,4681,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0
}; };
short filt24_5_dcr[24] __attribute__((aligned(16))) ={ short filt24_5_dcr[24] __attribute__((aligned(32))) ={
0,0,0,0,0,2730,5461,8192,10922,13653,16384,11703,9362,7022,4681,2730,0,0,0,0,0,0,0,0 0,0,0,0,0,2730,5461,8192,10922,13653,16384,11703,9362,7022,4681,2730,0,0,0,0,0,0,0,0
}; };
short filt24_6[24] __attribute__((aligned(16))) ={ short filt24_6[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0 0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0
}; };
short filt24_6_dcl[24] __attribute__((aligned(16))) ={ short filt24_6_dcl[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0 0,0,0,0,0,0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0
}; };
short filt24_6_dcr[24] __attribute__((aligned(16))) ={ short filt24_6_dcr[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0 0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0,0
}; };
short filt24_7[24] __attribute__((aligned(16))) ={ short filt24_7[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0 0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0
}; };
short filt24_7_dcl[24] __attribute__((aligned(16))) ={ short filt24_7_dcl[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0 0,0,0,0,0,0,0,4681,7022,9362,11703,14043,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0
}; };
short filt24_7_dcr[24] __attribute__((aligned(16))) ={ short filt24_7_dcr[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0 0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,14043,11703,9362,7022,4681,0,0,0,0,0,0
}; };
short filt24_0l[24] __attribute__((aligned(16))) ={ short filt24_0l[24] __attribute__((aligned(32))) ={
30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0 30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_1l[24] __attribute__((aligned(16))) ={ short filt24_1l[24] __attribute__((aligned(32))) ={
0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0 0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_2l[24] __attribute__((aligned(16))) ={ short filt24_2l[24] __attribute__((aligned(32))) ={
0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0 0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_3l[24] __attribute__((aligned(16))) ={ short filt24_3l[24] __attribute__((aligned(32))) ={
//0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0}; //0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0};
0,0,0,0,0,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0 0,0,0,0,0,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_4l[24] __attribute__((aligned(16))) ={ short filt24_4l[24] __attribute__((aligned(32))) ={
0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0 0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0
}; };
short filt24_5l[24] __attribute__((aligned(16))) ={ short filt24_5l[24] __attribute__((aligned(32))) ={
0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0 0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0
}; };
short filt24_6l[24] __attribute__((aligned(16))) ={ short filt24_6l[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0 0,0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0
}; };
short filt24_7l[24] __attribute__((aligned(16))) ={ short filt24_7l[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0 0,0,0,0,0,0,0,30037,27306,24576,21845,19114,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0
}; };
short filt24_0l2[24] __attribute__((aligned(16))) ={ short filt24_0l2[24] __attribute__((aligned(32))) ={
2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0 2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_1l2[24] __attribute__((aligned(16))) ={ short filt24_1l2[24] __attribute__((aligned(32))) ={
0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0 0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_2l2[24] __attribute__((aligned(16))) ={ short filt24_2l2[24] __attribute__((aligned(32))) ={
-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0 -2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_3l2[24] __attribute__((aligned(16))) ={ short filt24_3l2[24] __attribute__((aligned(32))) ={
-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0 -5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_4l2[24] __attribute__((aligned(16))) ={ short filt24_4l2[24] __attribute__((aligned(32))) ={
-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0 -8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0,0
}; };
short filt24_5l2[24] __attribute__((aligned(16))) ={ short filt24_5l2[24] __attribute__((aligned(32))) ={
0,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0 0,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0,0
}; };
short filt24_6l2[24] __attribute__((aligned(16))) ={ short filt24_6l2[24] __attribute__((aligned(32))) ={
-13653,-10922,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0 -13653,-10922,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0,0
}; };
short filt24_7l2[24] __attribute__((aligned(16))) ={ short filt24_7l2[24] __attribute__((aligned(32))) ={
0,-13653,-10922,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0 0,-13653,-10922,-8192,-5461,-2730,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,0,0,0,0,0
}; };
short filt24_0r[24] __attribute__((aligned(16))) ={ short filt24_0r[24] __attribute__((aligned(32))) ={
2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0,0,0 2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_1r[24] __attribute__((aligned(16))) ={ short filt24_1r[24] __attribute__((aligned(32))) ={
0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0,0 0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_2r[24] __attribute__((aligned(16))) ={ short filt24_2r[24] __attribute__((aligned(32))) ={
0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0 0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_3r[24] __attribute__((aligned(16))) ={ short filt24_3r[24] __attribute__((aligned(32))) ={
0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0 0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0,0
}; };
short filt24_4r[24] __attribute__((aligned(16))) ={ short filt24_4r[24] __attribute__((aligned(32))) ={
0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0 0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0,0
}; };
short filt24_5r[24] __attribute__((aligned(16))) ={ short filt24_5r[24] __attribute__((aligned(32))) ={
0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0 0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0,0
}; };
short filt24_6r[24] __attribute__((aligned(16))) ={ short filt24_6r[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0 0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0,0
}; };
short filt24_7r[24] __attribute__((aligned(16))) ={ short filt24_7r[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0 0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,19114,21845,24576,27306,30037,0,0,0,0,0,0
}; };
short filt24_0r2[24] __attribute__((aligned(16))) ={ /****/ short filt24_0r2[24] __attribute__((aligned(32))) ={ /****/
2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0,0,0 2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0,0,0
}; };
short filt24_1r2[24] __attribute__((aligned(16))) ={ short filt24_1r2[24] __attribute__((aligned(32))) ={
0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0,0 0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0,0
}; };
short filt24_2r2[24] __attribute__((aligned(16))) ={ short filt24_2r2[24] __attribute__((aligned(32))) ={
0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0 0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0,0
}; };
short filt24_3r2[24] __attribute__((aligned(16))) ={ short filt24_3r2[24] __attribute__((aligned(32))) ={
0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0 0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0,0
}; };
short filt24_4r2[24] __attribute__((aligned(16))) ={ short filt24_4r2[24] __attribute__((aligned(32))) ={
0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0 0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0,0
}; };
short filt24_5r2[24] __attribute__((aligned(16))) ={ short filt24_5r2[24] __attribute__((aligned(32))) ={
0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0 0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0,0
}; };
short filt24_6r2[24] __attribute__((aligned(16))) ={ short filt24_6r2[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0 0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653,0
}; };
short filt24_7r2[24] __attribute__((aligned(16))) ={ short filt24_7r2[24] __attribute__((aligned(32))) ={
0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653 0,0,0,0,0,0,0,2730,5461,8192,10922,13653,16384,13653,10922,8192,5461,2730,0,-2730,-5461,-8192,-10922,-13653
}; };
...@@ -203,8 +203,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, ...@@ -203,8 +203,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
multadd_complex_vector_real_scalar(dl_ch-(phy_vars_ue->lte_frame_parms.ofdm_symbol_size<<1), multadd_complex_vector_real_scalar(dl_ch-(phy_vars_ue->lte_frame_parms.ofdm_symbol_size<<1),
phy_vars_ue->ch_est_alpha,dl_ch-(phy_vars_ue->lte_frame_parms.ofdm_symbol_size<<1), phy_vars_ue->ch_est_alpha,dl_ch-(phy_vars_ue->lte_frame_parms.ofdm_symbol_size<<1),
1,phy_vars_ue->lte_frame_parms.ofdm_symbol_size); 1,phy_vars_ue->lte_frame_parms.ofdm_symbol_size);
#ifdef DEBUG_CH
printf("k %d, first_carrier %d\n",k,phy_vars_ue->lte_frame_parms.first_carrier_offset);
#endif
if ((phy_vars_ue->lte_frame_parms.N_RB_DL==6) || if ((phy_vars_ue->lte_frame_parms.N_RB_DL==6) ||
(phy_vars_ue->lte_frame_parms.N_RB_DL==50) || (phy_vars_ue->lte_frame_parms.N_RB_DL==50) ||
(phy_vars_ue->lte_frame_parms.N_RB_DL==100)) { (phy_vars_ue->lte_frame_parms.N_RB_DL==100)) {
...@@ -213,7 +214,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, ...@@ -213,7 +214,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
// Treat first 2 pilots specially (left edge) // Treat first 2 pilots specially (left edge)
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
// printf("pilot 0 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); #ifdef DEBUG_CH
printf("pilot 0 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(fl, multadd_real_vector_complex_scalar(fl,
ch, ch,
dl_ch, dl_ch,
...@@ -224,7 +227,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, ...@@ -224,7 +227,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
// printf("pilot 1 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); #ifdef DEBUG_CH
printf("pilot 1 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f2l2, multadd_real_vector_complex_scalar(f2l2,
ch, ch,
dl_ch, dl_ch,
...@@ -235,15 +240,13 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, ...@@ -235,15 +240,13 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
for (pilot_cnt=2; pilot_cnt<((phy_vars_ue->lte_frame_parms.N_RB_DL)-1); pilot_cnt+=2) { for (pilot_cnt=2; pilot_cnt<((phy_vars_ue->lte_frame_parms.N_RB_DL)-1); pilot_cnt+=2) {
// printf("%d\n",dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]);
// printf("pilot[%d][%d] (%d,%d)\n",p,pilot_cnt,pil[0],pil[1]);
// printf("rx[%d] -> (%d,%d)\n", k, rxF[0], rxF[1]);
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); //Re ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); //Re
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); //Im ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); //Im
// printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); #ifdef DEBUG_CH
printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f, multadd_real_vector_complex_scalar(f,
ch, ch,
dl_ch, dl_ch,
...@@ -254,13 +257,11 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, ...@@ -254,13 +257,11 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
rxF+=12; rxF+=12;
dl_ch+=8; dl_ch+=8;
// printf("pilot[%d][%d] (%d,%d)\n",p,rb,pil[0],pil[1]);
// printf("rx[%d] -> (%d,%d)\n", k+6, rxF[0], rxF[1]);
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
// printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); #ifdef DEBUG_CH
printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f2, multadd_real_vector_complex_scalar(f2,
ch, ch,
dl_ch, dl_ch,
...@@ -281,15 +282,17 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, ...@@ -281,15 +282,17 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+1+k))]; rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+1+k))];
#ifdef DEBUG_CH
printf("second half k %d\n",k);
#endif
for (pilot_cnt=0; pilot_cnt<((phy_vars_ue->lte_frame_parms.N_RB_DL)-3); pilot_cnt+=2) { for (pilot_cnt=0; pilot_cnt<((phy_vars_ue->lte_frame_parms.N_RB_DL)-3); pilot_cnt+=2) {
// printf("pilot[%d][%d] (%d,%d)\n",p,pilot_cnt,pil[0],pil[1]);
// printf("rx[%d] -> (%d,%d)\n", k+6, rxF[0], rxF[1]);
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
// printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f, multadd_real_vector_complex_scalar(f,
ch, ch,
dl_ch, dl_ch,
...@@ -300,8 +303,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, ...@@ -300,8 +303,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
// printf("**rb %d %d\n",rb,dl_ch-(int16_T *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]); printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f2, multadd_real_vector_complex_scalar(f2,
ch, ch,
dl_ch, dl_ch,
...@@ -314,8 +318,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, ...@@ -314,8 +318,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
// printf("pilot 49: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); #ifdef DEBUG_CH
printf("pilot %d: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(fr, multadd_real_vector_complex_scalar(fr,
ch, ch,
dl_ch, dl_ch,
...@@ -326,7 +331,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue, ...@@ -326,7 +331,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
// printf("pilot 50: rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]); #ifdef DEBUG_CH
printf("pilot %d: rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f2r2, multadd_real_vector_complex_scalar(f2r2,
ch, ch,
dl_ch, dl_ch,
......
...@@ -52,8 +52,8 @@ ...@@ -52,8 +52,8 @@
int* sync_corr_ue0 = NULL; int* sync_corr_ue0 = NULL;
int* sync_corr_ue1 = NULL; int* sync_corr_ue1 = NULL;
int* sync_corr_ue2 = NULL; int* sync_corr_ue2 = NULL;
int sync_tmp[2048*4] __attribute__((aligned(16))); int sync_tmp[2048*4] __attribute__((aligned(32)));
short syncF_tmp[2048*2] __attribute__((aligned(16))); short syncF_tmp[2048*2] __attribute__((aligned(32)));
......
...@@ -56,8 +56,8 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq) ...@@ -56,8 +56,8 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,int band,unsigned int DL_freq)
{ {
#if defined(__x86_64__) || defined(__i386__) #if defined(__x86_64__) || defined(__i386__)
UE_SCAN_INFO_t *scan_info = &ue->scan_info[band]; UE_SCAN_INFO_t *scan_info = &ue->scan_info[band];
int16_t spectrum[12288] __attribute__((aligned(16))); int16_t spectrum[12288] __attribute__((aligned(32)));
int16_t spectrum_p5ms[12288] __attribute__((aligned(16))); int16_t spectrum_p5ms[12288] __attribute__((aligned(32)));
int i,f,band_idx; int i,f,band_idx;
__m128i autocorr0[256/4],autocorr1[256/4],autocorr2[256/4]; __m128i autocorr0[256/4],autocorr1[256/4],autocorr2[256/4];
__m128i autocorr0_t[256/4],autocorr1_t[256/4],autocorr2_t[256/4]; __m128i autocorr0_t[256/4],autocorr1_t[256/4],autocorr2_t[256/4];
......
...@@ -61,21 +61,18 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14] ...@@ -61,21 +61,18 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14]
x2 = Ncp + x2 = Ncp +
(Nid_cell<<1) + (Nid_cell<<1) +
(((1+(Nid_cell<<1))*(1 + (((frame_parms->Ncp==0)?4:3)*l) + (7*(1+ns))))<<10); //cinit (((1+(Nid_cell<<1))*(1 + (((frame_parms->Ncp==0)?4:3)*l) + (7*(1+ns))))<<10); //cinit
//x2 = frame_parms->Ncp + (Nid_cell<<1) + (1+(Nid_cell<<1))*(1 + (3*l) + (7*(1+ns))); //cinit //x2 = frame_parms->Ncp + (Nid_cell<<1) + (1+(Nid_cell<<1))*(1 + (3*l) + (7*(1+ns))); //cinit
//n = 0 //n = 0
// printf("cinit (ns %d, l %d) => %d\n",ns,l,x2);
x1 = 1+ (1<<31); x1 = 1+ (1<<31);
x2=x2 ^ ((x2 ^ (x2>>1) ^ (x2>>2) ^ (x2>>3))<<31); x2=x2 ^ ((x2 ^ (x2>>1) ^ (x2>>2) ^ (x2>>3))<<31);
// skip first 50 double words (1600 bits) // skip first 50 double words (1600 bits)
//printf("n=0 : x1 %x, x2 %x\n",x1,x2);
for (n=1; n<50; n++) { for (n=1; n<50; n++) {
x1 = (x1>>1) ^ (x1>>4); x1 = (x1>>1) ^ (x1>>4);
x1 = x1 ^ (x1<<31) ^ (x1<<28); x1 = x1 ^ (x1<<31) ^ (x1<<28);
x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4); x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4);
x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28); x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28);
// printf("x1 : %x, x2 : %x\n",x1,x2);
} }
for (n=0; n<14; n++) { for (n=0; n<14; n++) {
...@@ -84,7 +81,6 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14] ...@@ -84,7 +81,6 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14]
x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4); x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4);
x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28); x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28);
lte_gold_table[ns][l][n] = x1^x2; lte_gold_table[ns][l][n] = x1^x2;
// printf("n=%d : c %x\n",n,x1^x2);
} }
} }
......
...@@ -117,7 +117,7 @@ LTE_UE_DLSCH_t *new_ue_dlsch(uint8_t Kmimo,uint8_t Mdlharq,uint32_t Nsoft,uint8_ ...@@ -117,7 +117,7 @@ LTE_UE_DLSCH_t *new_ue_dlsch(uint8_t Kmimo,uint8_t Mdlharq,uint32_t Nsoft,uint8_
dlsch->max_turbo_iterations = max_turbo_iterations; dlsch->max_turbo_iterations = max_turbo_iterations;
for (i=0; i<Mdlharq; i++) { for (i=0; i<Mdlharq; i++) {
// msg("new_ue_dlsch: Harq process %d\n",i); // printf("new_ue_dlsch: Harq process %d\n",i);
dlsch->harq_processes[i] = (LTE_DL_UE_HARQ_t *)malloc16(sizeof(LTE_DL_UE_HARQ_t)); dlsch->harq_processes[i] = (LTE_DL_UE_HARQ_t *)malloc16(sizeof(LTE_DL_UE_HARQ_t));
if (dlsch->harq_processes[i]) { if (dlsch->harq_processes[i]) {
...@@ -156,7 +156,7 @@ LTE_UE_DLSCH_t *new_ue_dlsch(uint8_t Kmimo,uint8_t Mdlharq,uint32_t Nsoft,uint8_ ...@@ -156,7 +156,7 @@ LTE_UE_DLSCH_t *new_ue_dlsch(uint8_t Kmimo,uint8_t Mdlharq,uint32_t Nsoft,uint8_
return(dlsch); return(dlsch);
} }
msg("new_ue_dlsch with size %zu: exit_flag = %u\n",sizeof(LTE_DL_UE_HARQ_t), exit_flag); printf("new_ue_dlsch with size %zu: exit_flag = %u\n",sizeof(LTE_DL_UE_HARQ_t), exit_flag);
free_ue_dlsch(dlsch); free_ue_dlsch(dlsch);
return(NULL); return(NULL);
...@@ -187,6 +187,27 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue, ...@@ -187,6 +187,27 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
uint8_t crc_type; uint8_t crc_type;
#ifdef DEBUG_DLSCH_DECODING #ifdef DEBUG_DLSCH_DECODING
uint16_t i; uint16_t i;
#endif
#ifdef __AVX2__
int Kr_last,skipped_last=0;
uint8_t (*tc_2cw)(int16_t *y,
int16_t *y2,
uint8_t *,
uint8_t *,
uint16_t,
uint16_t,
uint16_t,
uint8_t,
uint8_t,
uint8_t,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *,
time_stats_t *);
#endif #endif
uint8_t (*tc)(int16_t *y, uint8_t (*tc)(int16_t *y,
uint8_t *, uint8_t *,
...@@ -204,28 +225,35 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue, ...@@ -204,28 +225,35 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
time_stats_t *, time_stats_t *,
time_stats_t *); time_stats_t *);
if (!dlsch_llr) { if (!dlsch_llr) {
msg("dlsch_decoding.c: NULL dlsch_llr pointer\n"); printf("dlsch_decoding.c: NULL dlsch_llr pointer\n");
return(dlsch->max_turbo_iterations); return(dlsch->max_turbo_iterations);
} }
if (!harq_process) { if (!harq_process) {
msg("dlsch_decoding.c: NULL harq_process pointer\n"); printf("dlsch_decoding.c: NULL harq_process pointer\n");
return(dlsch->max_turbo_iterations); return(dlsch->max_turbo_iterations);
} }
if (!frame_parms) { if (!frame_parms) {
msg("dlsch_decoding.c: NULL frame_parms pointer\n"); printf("dlsch_decoding.c: NULL frame_parms pointer\n");
return(dlsch->max_turbo_iterations); return(dlsch->max_turbo_iterations);
} }
if (subframe>9) { if (subframe>9) {
msg("dlsch_decoding.c: Illegal subframe index %d\n",subframe); printf("dlsch_decoding.c: Illegal subframe index %d\n",subframe);
return(dlsch->max_turbo_iterations); return(dlsch->max_turbo_iterations);
} }
if (llr8_flag == 0) if (llr8_flag == 0) {
#ifdef __AVX2__
tc_2cw = phy_threegpplte_turbo_decoder16avx2;
#endif
tc = phy_threegpplte_turbo_decoder16; tc = phy_threegpplte_turbo_decoder16;
}
else else
tc = phy_threegpplte_turbo_decoder8; tc = phy_threegpplte_turbo_decoder8;
...@@ -233,13 +261,13 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue, ...@@ -233,13 +261,13 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
/* /*
if (nb_rb > frame_parms->N_RB_DL) { if (nb_rb > frame_parms->N_RB_DL) {
msg("dlsch_decoding.c: Illegal nb_rb %d\n",nb_rb); printf("dlsch_decoding.c: Illegal nb_rb %d\n",nb_rb);
return(max_turbo_iterations); return(max_turbo_iterations);
}*/ }*/
/*harq_pid = dlsch->current_harq_pid; /*harq_pid = dlsch->current_harq_pid;
if (harq_pid >= 8) { if (harq_pid >= 8) {
msg("dlsch_decoding.c: Illegal harq_pid %d\n",harq_pid); printf("dlsch_decoding.c: Illegal harq_pid %d\n",harq_pid);
return(max_turbo_iterations); return(max_turbo_iterations);
} }
*/ */
...@@ -254,7 +282,7 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue, ...@@ -254,7 +282,7 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
G = harq_process->G; G = harq_process->G;
//get_G(frame_parms,nb_rb,dlsch->rb_alloc,mod_order,num_pdcch_symbols,phy_vars_ue->frame,subframe); //get_G(frame_parms,nb_rb,dlsch->rb_alloc,mod_order,num_pdcch_symbols,phy_vars_ue->frame,subframe);
// msg("DLSCH Decoding, harq_pid %d Ndi %d\n",harq_pid,harq_process->Ndi); // printf("DLSCH Decoding, harq_pid %d Ndi %d\n",harq_pid,harq_process->Ndi);
if (harq_process->round == 0) { if (harq_process->round == 0) {
// This is a new packet, so compute quantities regarding segmentation // This is a new packet, so compute quantities regarding segmentation
...@@ -273,7 +301,7 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue, ...@@ -273,7 +301,7 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
/* /*
else { else {
msg("dlsch_decoding.c: Ndi>0 not checked yet!!\n"); printf("dlsch_decoding.c: Ndi>0 not checked yet!!\n");
return(max_turbo_iterations); return(max_turbo_iterations);
} }
*/ */
...@@ -300,10 +328,14 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue, ...@@ -300,10 +328,14 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
break; break;
} }
if (harq_process->C >= MAX_NUM_DLSCH_SEGMENTS/bw_scaling) { if (harq_process->C > MAX_NUM_DLSCH_SEGMENTS/bw_scaling) {
LOG_E(PHY,"Illegal harq_process->C %d > %d\n",harq_process->C,MAX_NUM_DLSCH_SEGMENTS/bw_scaling); LOG_E(PHY,"Illegal harq_process->C %d > %d\n",harq_process->C,MAX_NUM_DLSCH_SEGMENTS/bw_scaling);
return((1+dlsch->max_turbo_iterations)); return((1+dlsch->max_turbo_iterations));
} }
#ifdef DEBUG_DLSCH_DECODING
printf("Segmentation: C %d, Cminus %d, Kminus %d, Kplus %d\n",harq_process->C,harq_process->Cminus,harq_process->Kminus,harq_process->Kplus);
#endif
for (r=0; r<harq_process->C; r++) { for (r=0; r<harq_process->C; r++) {
...@@ -324,7 +356,7 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue, ...@@ -324,7 +356,7 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
else if (Kr_bytes <= 768) else if (Kr_bytes <= 768)
iind = 123 + ((Kr_bytes-256)>>3); iind = 123 + ((Kr_bytes-256)>>3);
else { else {
msg("dlsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes); printf("dlsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes);
return(dlsch->max_turbo_iterations); return(dlsch->max_turbo_iterations);
} }
...@@ -418,15 +450,12 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue, ...@@ -418,15 +450,12 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
printf("\n"); printf("\n");
*/ */
//#ifndef __AVX2__
#if 1
if (err_flag == 0) { if (err_flag == 0) {
start_meas(dlsch_turbo_decoding_stats); start_meas(dlsch_turbo_decoding_stats);
#ifdef TURBO_S
ret = phy_threegpplte_turbo_decoder_scalar
#else
ret = tc ret = tc
#endif
(&harq_process->d[r][96], (&harq_process->d[r][96],
harq_process->c[r], harq_process->c[r],
Kr, Kr,
...@@ -446,7 +475,130 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue, ...@@ -446,7 +475,130 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
stop_meas(dlsch_turbo_decoding_stats); stop_meas(dlsch_turbo_decoding_stats);
} }
#else
if ((harq_process->C == 1) ||
((r==harq_process->C-1) && (skipped_last==0))) { // last segment with odd number of segments
start_meas(dlsch_turbo_decoding_stats);
ret = tc
(&harq_process->d[r][96],
harq_process->c[r],
Kr,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
stop_meas(dlsch_turbo_decoding_stats);
// printf("single decode, exit\n");
// exit(-1);
}
else {
// we can merge code segments
if ((skipped_last == 0) && (r<harq_process->C-1)) {
skipped_last = 1;
Kr_last = Kr;
}
else {
skipped_last=0;
if (Kr_last == Kr) { // decode 2 code segments with AVX2 version
#ifdef DEBUG_DLSCH_DECODING
printf("single decoding segment %d (%p)\n",r-1,&harq_process->d[r-1][96]);
#endif
start_meas(dlsch_turbo_decoding_stats);
#ifdef DEBUG_DLSCH_DECODING
printf("double decoding segments %d,%d (%p,%p)\n",r-1,r,&harq_process->d[r-1][96],&harq_process->d[r][96]);
#endif
ret = tc_2cw
(&harq_process->d[r-1][96],
&harq_process->d[r][96],
harq_process->c[r-1],
harq_process->c[r],
Kr,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
/*
ret = tc
(&harq_process->d[r-1][96],
harq_process->c[r-1],
Kr_last,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
exit(-1);*/
stop_meas(dlsch_turbo_decoding_stats);
}
else { // Kr_last != Kr
start_meas(dlsch_turbo_decoding_stats);
ret = tc
(&harq_process->d[r-1][96],
harq_process->c[r-1],
Kr_last,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
stop_meas(dlsch_turbo_decoding_stats);
start_meas(dlsch_turbo_decoding_stats);
ret = tc
(&harq_process->d[r][96],
harq_process->c[r],
Kr,
f1f2mat_old[iind*2],
f1f2mat_old[(iind*2)+1],
dlsch->max_turbo_iterations,
crc_type,
(r==0) ? harq_process->F : 0,
&phy_vars_ue->dlsch_tc_init_stats,
&phy_vars_ue->dlsch_tc_alpha_stats,
&phy_vars_ue->dlsch_tc_beta_stats,
&phy_vars_ue->dlsch_tc_gamma_stats,
&phy_vars_ue->dlsch_tc_ext_stats,
&phy_vars_ue->dlsch_tc_intl1_stats,
&phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
stop_meas(dlsch_turbo_decoding_stats);
}
}
}
#endif
if ((err_flag == 0) && (ret>=(1+dlsch->max_turbo_iterations))) {// a Code segment is in error so break; if ((err_flag == 0) && (ret>=(1+dlsch->max_turbo_iterations))) {// a Code segment is in error so break;
......
...@@ -1898,17 +1898,17 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms, ...@@ -1898,17 +1898,17 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms,
for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {
dl_ch0_128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch0_128 = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12]; // hr,0
dl_ch1_128 = (__m128i *)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch1_128 = (__m128i *)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12]; // hr,1
dl_ch_mag0_128 = (__m128i *)&dl_ch_mag0[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag0_128 = (__m128i *)&dl_ch_mag0[aarx][symbol*frame_parms->N_RB_DL*12];
dl_ch_mag0_128b = (__m128i *)&dl_ch_magb0[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag0_128b = (__m128i *)&dl_ch_magb0[aarx][symbol*frame_parms->N_RB_DL*12];
dl_ch_mag1_128 = (__m128i *)&dl_ch_mag1[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag1_128 = (__m128i *)&dl_ch_mag1[aarx][symbol*frame_parms->N_RB_DL*12];
dl_ch_mag1_128b = (__m128i *)&dl_ch_magb1[aarx][symbol*frame_parms->N_RB_DL*12]; dl_ch_mag1_128b = (__m128i *)&dl_ch_magb1[aarx][symbol*frame_parms->N_RB_DL*12];
rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12]; // yr
rxdataF_comp0_128 = (__m128i *)&rxdataF_comp0[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp0_128 = (__m128i *)&rxdataF_comp0[aarx][symbol*frame_parms->N_RB_DL*12]; // yr,0 = yr * conj(hr,0)
rxdataF_comp1_128 = (__m128i *)&rxdataF_comp1[aarx][symbol*frame_parms->N_RB_DL*12]; rxdataF_comp1_128 = (__m128i *)&rxdataF_comp1[aarx][symbol*frame_parms->N_RB_DL*12]; // yr,1 = yr * conj(hr,1)
for (rb=0; rb<nb_rb; rb++) { for (rb=0; rb<nb_rb; rb++) {
......
...@@ -608,7 +608,7 @@ int32_t generate_prach( PHY_VARS_UE *phy_vars_ue, uint8_t eNB_id, uint8_t subfra ...@@ -608,7 +608,7 @@ int32_t generate_prach( PHY_VARS_UE *phy_vars_ue, uint8_t eNB_id, uint8_t subfra
uint8_t preamble_index = phy_vars_ue->prach_resources[eNB_id]->ra_PreambleIndex; uint8_t preamble_index = phy_vars_ue->prach_resources[eNB_id]->ra_PreambleIndex;
uint8_t tdd_mapindex = phy_vars_ue->prach_resources[eNB_id]->ra_TDD_map_index; uint8_t tdd_mapindex = phy_vars_ue->prach_resources[eNB_id]->ra_TDD_map_index;
int16_t *prachF = phy_vars_ue->lte_ue_prach_vars[eNB_id]->prachF; int16_t *prachF = phy_vars_ue->lte_ue_prach_vars[eNB_id]->prachF;
static int16_t prach_tmp[45600*2] __attribute__((aligned(16))); static int16_t prach_tmp[45600*2] __attribute__((aligned(32)));
int16_t *prach = prach_tmp; int16_t *prach = prach_tmp;
int16_t *prach2; int16_t *prach2;
int16_t amp = phy_vars_ue->lte_ue_prach_vars[eNB_id]->amp; int16_t amp = phy_vars_ue->lte_ue_prach_vars[eNB_id]->amp;
......
...@@ -91,7 +91,7 @@ void PHY_ofdm_mod(int *input, /// pointer to complex input ...@@ -91,7 +91,7 @@ void PHY_ofdm_mod(int *input, /// pointer to complex input
) )
{ {
static short temp[2048*4] __attribute__((aligned(16))); static short temp[2048*4] __attribute__((aligned(32)));
unsigned short i,j; unsigned short i,j;
short k; short k;
...@@ -143,9 +143,18 @@ void PHY_ofdm_mod(int *input, /// pointer to complex input ...@@ -143,9 +143,18 @@ void PHY_ofdm_mod(int *input, /// pointer to complex input
printf("[PHY] symbol %d/%d offset %d (%p,%p -> %p)\n",i,nb_symbols,i*fftsize+(i*nb_prefix_samples),input,&input[i*fftsize],&output[(i*fftsize) + ((i)*nb_prefix_samples)]); printf("[PHY] symbol %d/%d offset %d (%p,%p -> %p)\n",i,nb_symbols,i*fftsize+(i*nb_prefix_samples),input,&input[i*fftsize],&output[(i*fftsize) + ((i)*nb_prefix_samples)]);
#endif #endif
#ifndef __AVX2__
// handle 128-bit alignment for 128-bit SIMD (SSE4,NEON,AltiVEC)
idft((int16_t *)&input[i*fftsize], idft((int16_t *)&input[i*fftsize],
(fftsize==128) ? (int16_t *)temp : (int16_t *)&output[(i*fftsize) + ((1+i)*nb_prefix_samples)], (fftsize==128) ? (int16_t *)temp : (int16_t *)&output[(i*fftsize) + ((1+i)*nb_prefix_samples)],
1); 1);
#else
// on AVX2 need 256-bit alignment
idft((int16_t *)&input[i*fftsize],
(fftsize<=512) ? (int16_t *)temp : (int16_t *)&output[(i*fftsize) + ((1+i)*nb_prefix_samples)],
1);
#endif
// Copy to frame buffer with Cyclic Extension // Copy to frame buffer with Cyclic Extension
// Note: will have to adjust for synchronization offset! // Note: will have to adjust for synchronization offset!
...@@ -158,7 +167,12 @@ void PHY_ofdm_mod(int *input, /// pointer to complex input ...@@ -158,7 +167,12 @@ void PHY_ofdm_mod(int *input, /// pointer to complex input
// msg("Doing cyclic prefix method\n"); // msg("Doing cyclic prefix method\n");
if (fftsize==128) { #ifndef __AVX2__
if (fftsize==128)
#else
if (fftsize<=512)
#endif
{
for (j=0; j<fftsize ; j++) { for (j=0; j<fftsize ; j++) {
output_ptr[j] = temp_ptr[j]; output_ptr[j] = temp_ptr[j];
} }
......
...@@ -56,7 +56,7 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue, ...@@ -56,7 +56,7 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
unsigned int rx_offset; unsigned int rx_offset;
void (*dft)(int16_t *,int16_t *, int); void (*dft)(int16_t *,int16_t *, int);
int tmp_dft_in[256]; // This is for misalignment issues for 6 and 15 PRBs int tmp_dft_in[2048]; // This is for misalignment issues for 6 and 15 PRBs
switch (frame_parms->ofdm_symbol_size) { switch (frame_parms->ofdm_symbol_size) {
case 128: case 128:
...@@ -115,8 +115,8 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue, ...@@ -115,8 +115,8 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
memset(&ue_common_vars->rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],0,frame_parms->ofdm_symbol_size*sizeof(int)); memset(&ue_common_vars->rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],0,frame_parms->ofdm_symbol_size*sizeof(int));
rx_offset = sample_offset + slot_offset + nb_prefix_samples0 + subframe_offset - SOFFSET; rx_offset = sample_offset + slot_offset + nb_prefix_samples0 + subframe_offset - SOFFSET;
// Align with 128 bit // Align with 256 bit
rx_offset = rx_offset - rx_offset % 4; // rx_offset = rx_offset&0xfffffff8;
#ifdef DEBUG_FEP #ifdef DEBUG_FEP
// if (phy_vars_ue->frame <100) // if (phy_vars_ue->frame <100)
...@@ -131,9 +131,9 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue, ...@@ -131,9 +131,9 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
(short *)&ue_common_vars->rxdata[aa][0], (short *)&ue_common_vars->rxdata[aa][0],
frame_parms->ofdm_symbol_size*sizeof(int)); frame_parms->ofdm_symbol_size*sizeof(int));
if ((rx_offset&3)!=0) { // if input to dft is not 128-bit aligned, issue for size 6 and 15 PRBs if ((rx_offset&7)!=0) { // if input to dft is not 256-bit aligned, issue for size 6,15 and 25 PRBs
memcpy((void *)tmp_dft_in, memcpy((void *)tmp_dft_in,
(void *)&ue_common_vars->rxdata[aa][(rx_offset-nb_prefix_samples0) % frame_length_samples], (void *)&ue_common_vars->rxdata[aa][rx_offset % frame_length_samples],
frame_parms->ofdm_symbol_size*sizeof(int)); frame_parms->ofdm_symbol_size*sizeof(int));
dft((int16_t *)tmp_dft_in, dft((int16_t *)tmp_dft_in,
(int16_t *)&ue_common_vars->rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1); (int16_t *)&ue_common_vars->rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1);
...@@ -146,8 +146,8 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue, ...@@ -146,8 +146,8 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
} }
} else { } else {
rx_offset += (frame_parms->ofdm_symbol_size+nb_prefix_samples) + rx_offset += (frame_parms->ofdm_symbol_size+nb_prefix_samples)*l;// +
(frame_parms->ofdm_symbol_size+nb_prefix_samples)*(l-1); // (frame_parms->ofdm_symbol_size+nb_prefix_samples)*(l-1);
#ifdef DEBUG_FEP #ifdef DEBUG_FEP
// if (phy_vars_ue->frame <100) // if (phy_vars_ue->frame <100)
...@@ -162,7 +162,7 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue, ...@@ -162,7 +162,7 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
start_meas(&phy_vars_ue->rx_dft_stats); start_meas(&phy_vars_ue->rx_dft_stats);
if ((rx_offset&3)!=0) { // if input to dft is not 128-bit aligned, issue for size 6 and 15 PRBs if ((rx_offset&7)!=0) { // if input to dft is not 128-bit aligned, issue for size 6 and 15 PRBs
memcpy((void *)tmp_dft_in, memcpy((void *)tmp_dft_in,
(void *)&ue_common_vars->rxdata[aa][(rx_offset) % frame_length_samples], (void *)&ue_common_vars->rxdata[aa][(rx_offset) % frame_length_samples],
frame_parms->ofdm_symbol_size*sizeof(int)); frame_parms->ofdm_symbol_size*sizeof(int));
......
lte_dfts: lte_dfts.c lte_dfts_sse4: lte_dfts.c
gcc -O2 -mavx2 -g -ggdb -o lte_dfts lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS gcc -O2 -msse4.1 -g -ggdb -o lte_dfts_sse4 lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS
lte_dfts.s: lte_dfts.c lte_dfts_avx2: lte_dfts.c
gcc -O2 -mavx2 -g -ggdb -o lte_dfts_avx2 lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS
lte_dfts_avx2.s: lte_dfts.c
gcc -O2 -mavx2 -S lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS gcc -O2 -mavx2 -S lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS
dft_cycles: lte_dfts lte_dfts_sse4.s: lte_dfts.c
./lte_dfts | egrep cycles gcc -O2 -msse4.1 -S lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS
dft_cycles_avx2: lte_dfts_avx2
./lte_dfts_avx2 | egrep cycles
This source diff could not be displayed because it is too large. You can view the blob instead.
/* Twiddles generated with
twa = floor(32767*exp(-sqrt(-1)*2*pi*(0:4095)/8192));
twa2 = zeros(1,2*4096);
twa2(1:2:end) = real(twa);
twa2(2:2:end) = imag(twa);
fd=fopen("twiddle_tmp.txt","w");
fprintf(fd,"static int16_t tw8192[4096*2] = {");
fprintf(fd,"%d,",twa2(1:(4096*2)-1));
fprintf(fd,"%d};\n",twa2(end));
fclose(fd);
*/
static int16_t tw8192[4096*2] = {32767,0,32766,-26,32766,-51,32766,-76,32766,-101,32766,-126,32766,-151,32766,-176,32766,-202,32766,-227,32766,-252,32765,-277,32765,-302,32765,-327,32765,-352,32764,-377,32764,-403,32764,-428,32763,-453,32763,-478,32763,-503,32762,-528,32762,-553,32761,-579,32761,-604,32760,-629,32760,-654,32759,-679,32759,-704,32758,-729,32758,-754,32757,-780,32757,-805,32756,-830,32755,-855,32755,-880,32754,-905,32753,-930,32753,-955,32752,-981,32751,-1006,32750,-1031,32750,-1056,32749,-1081,32748,-1106,32747,-1131,32746,-1156,32745,-1181,32744,-1207,32743,-1232,32742,-1257,32741,-1282,32740,-1307,32739,-1332,32738,-1357,32737,-1382,32736,-1407,32735,-1433,32734,-1458,32733,-1483,32732,-1508,32731,-1533,32729,-1558,32728,-1583,32727,-1608,32726,-1633,32725,-1659,32723,-1684,32722,-1709,32721,-1734,32719,-1759,32718,-1784,32717,-1809,32715,-1834,32714,-1859,32712,-1884,32711,-1909,32709,-1935,32708,-1960,32706,-1985,32705,-2010,32703,-2035,32702,-2060,32700,-2085,32699,-2110,32697,-2135,32695,-2160,32694,-2185,32692,-2210,32690,-2236,32688,-2261,32687,-2286,32685,-2311,32683,-2336,32681,-2361,32680,-2386,32678,-2411,32676,-2436,32674,-2461,32672,-2486,32670,-2511,32668,-2536,32666,-2561,32664,-2586,32662,-2611,32660,-2637,32658,-2662,32656,-2687,32654,-2712,32652,-2737,32650,-2762,32648,-2787,32646,-2812,32644,-2837,32641,-2862,32639,-2887,32637,-2912,32635,-2937,32632,-2962,32630,-2987,32628,-3012,32625,-3037,32623,-3062,32621,-3087,32618,-3112,32616,-3137,32614,-3162,32611,-3187,32609,-3212,32606,-3237,32604,-3262,32601,-3287,32599,-3312,32596,-3337,32594,-3362,32591,-3387,32588,-3412,32586,-3437,32583,-3462,32580,-3487,32578,-3512,32575,-3537,32572,-3562,32570,-3587,32567,-3612,32564,-3637,32561,-3662,32558,-3687,32556,-3712,32553,-3737,32550,-3762,32547,-3787,32544,-3812,32541,-3837,32538,-3862,32535,-3887,32532,-3912,32529,-3937,32526,-3962,32523,-3987,32520,-4012,32517,-4036,32514,-4061,32511,-4086,32508,-4111,32504,-4136,32501,-4161,32498,-41
86,32495,-4211,32492,-4236,32488,-4261,32485,-4286,32482,-4311,32478,-4336,32475,-4360,32472,-4385,32468,-4410,32465,-4435,32462,-4460,32458,-4485,32455,-4510,32451,-4535,32448,-4560,32444,-4585,32441,-4609,32437,-4634,32434,-4659,32430,-4684,32426,-4709,32423,-4734,32419,-4759,32416,-4784,32412,-4808,32408,-4833,32404,-4858,32401,-4883,32397,-4908,32393,-4933,32389,-4958,32386,-4982,32382,-5007,32378,-5032,32374,-5057,32370,-5082,32366,-5107,32362,-5131,32358,-5156,32354,-5181,32350,-5206,32346,-5231,32342,-5255,32338,-5280,32334,-5305,32330,-5330,32326,-5355,32322,-5379,32318,-5404,32314,-5429,32310,-5454,32305,-5479,32301,-5503,32297,-5528,32293,-5553,32288,-5578,32284,-5602,32280,-5627,32275,-5652,32271,-5677,32267,-5701,32262,-5726,32258,-5751,32254,-5776,32249,-5800,32245,-5825,32240,-5850,32236,-5875,32231,-5899,32227,-5924,32222,-5949,32218,-5973,32213,-5998,32208,-6023,32204,-6048,32199,-6072,32194,-6097,32190,-6122,32185,-6146,32180,-6171,32176,-6196,32171,-6220,32166,-6245,32161,-6270,32156,-6294,32152,-6319,32147,-6344,32142,-6368,32137,-6393,32132,-6418,32127,-6442,32122,-6467,32117,-6492,32112,-6516,32107,-6541,32102,-6565,32097,-6590,32092,-6615,32087,-6639,32082,-6664,32077,-6689,32072,-6713,32066,-6738,32061,-6762,32056,-6787,32051,-6812,32046,-6836,32040,-6861,32035,-6885,32030,-6910,32024,-6934,32019,-6959,32014,-6983,32008,-7008,32003,-7033,31998,-7057,31992,-7082,31987,-7106,31981,-7131,31976,-7155,31970,-7180,31965,-7204,31959,-7229,31954,-7253,31948,-7278,31943,-7302,31937,-7327,31931,-7351,31926,-7376,31920,-7400,31914,-7425,31909,-7449,31903,-7474,31897,-7498,31891,-7523,31886,-7547,31880,-7572,31874,-7596,31868,-7620,31862,-7645,31856,-7669,31851,-7694,31845,-7718,31839,-7743,31833,-7767,31827,-7791,31821,-7816,31815,-7840,31809,-7865,31803,-7889,31797,-7913,31791,-7938,31785,-7962,31778,-7987,31772,-8011,31766,-8035,31760,-8060,31754,-8084,31748,-8108,31741,-8133,31735,-8157,31729,-8181,31723,-8206,31716,-8230,31710,-8254,31704,-8279,31697
,-8303,31691,-8327,31684,-8352,31678,-8376,31672,-8400,31665,-8425,31659,-8449,31652,-8473,31646,-8497,31639,-8522,31633,-8546,31626,-8570,31619,-8594,31613,-8619,31606,-8643,31600,-8667,31593,-8691,31586,-8716,31580,-8740,31573,-8764,31566,-8788,31559,-8813,31553,-8837,31546,-8861,31539,-8885,31532,-8909,31525,-8933,31518,-8958,31512,-8982,31505,-9006,31498,-9030,31491,-9054,31484,-9078,31477,-9103,31470,-9127,31463,-9151,31456,-9175,31449,-9199,31442,-9223,31435,-9247,31428,-9271,31420,-9296,31413,-9320,31406,-9344,31399,-9368,31392,-9392,31385,-9416,31377,-9440,31370,-9464,31363,-9488,31356,-9512,31348,-9536,31341,-9560,31334,-9584,31326,-9608,31319,-9632,31311,-9656,31304,-9680,31297,-9704,31289,-9728,31282,-9752,31274,-9776,31267,-9800,31259,-9824,31252,-9848,31244,-9872,31236,-9896,31229,-9920,31221,-9944,31214,-9968,31206,-9992,31198,-10016,31191,-10040,31183,-10064,31175,-10088,31167,-10112,31160,-10136,31152,-10160,31144,-10183,31136,-10207,31128,-10231,31121,-10255,31113,-10279,31105,-10303,31097,-10327,31089,-10350,31081,-10374,31073,-10398,31065,-10422,31057,-10446,31049,-10470,31041,-10493,31033,-10517,31025,-10541,31017,-10565,31009,-10589,31001,-10612,30992,-10636,30984,-10660,30976,-10684,30968,-10707,30960,-10731,30951,-10755,30943,-10779,30935,-10802,30927,-10826,30918,-10850,30910,-10874,30902,-10897,30893,-10921,30885,-10945,30876,-10968,30868,-10992,30860,-11016,30851,-11039,30843,-11063,30834,-11087,30826,-11110,30817,-11134,30809,-11158,30800,-11181,30791,-11205,30783,-11228,30774,-11252,30766,-11276,30757,-11299,30748,-11323,30739,-11346,30731,-11370,30722,-11394,30713,-11417,30705,-11441,30696,-11464,30687,-11488,30678,-11511,30669,-11535,30660,-11558,30652,-11582,30643,-11605,30634,-11629,30625,-11652,30616,-11676,30607,-11699,30598,-11723,30589,-11746,30580,-11770,30571,-11793,30562,-11817,30553,-11840,30544,-11863,30535,-11887,30525,-11910,30516,-11934,30507,-11957,30498,-11981,30489,-12004,30480,-12027,30470,-12051,30461,-12074,30452,-12
097,30442,-12121,30433,-12144,30424,-12167,30415,-12191,30405,-12214,30396,-12237,30386,-12261,30377,-12284,30368,-12307,30358,-12331,30349,-12354,30339,-12377,30330,-12400,30320,-12424,30311,-12447,30301,-12470,30291,-12493,30282,-12517,30272,-12540,30263,-12563,30253,-12586,30243,-12610,30234,-12633,30224,-12656,30214,-12679,30205,-12702,30195,-12725,30185,-12749,30175,-12772,30165,-12795,30156,-12818,30146,-12841,30136,-12864,30126,-12887,30116,-12910,30106,-12934,30096,-12957,30086,-12980,30076,-13003,30066,-13026,30056,-13049,30046,-13072,30036,-13095,30026,-13118,30016,-13141,30006,-13164,29996,-13187,29986,-13210,29976,-13233,29966,-13256,29955,-13279,29945,-13302,29935,-13325,29925,-13348,29915,-13371,29904,-13394,29894,-13417,29884,-13440,29873,-13463,29863,-13486,29853,-13508,29842,-13531,29832,-13554,29822,-13577,29811,-13600,29801,-13623,29790,-13646,29780,-13668,29769,-13691,29759,-13714,29748,-13737,29738,-13760,29727,-13783,29717,-13805,29706,-13828,29695,-13851,29685,-13874,29674,-13896,29663,-13919,29653,-13942,29642,-13965,29631,-13987,29621,-14010,29610,-14033,29599,-14056,29588,-14078,29577,-14101,29567,-14124,29556,-14146,29545,-14169,29534,-14192,29523,-14214,29512,-14237,29501,-14260,29490,-14282,29479,-14305,29468,-14327,29457,-14350,29446,-14373,29435,-14395,29424,-14418,29413,-14440,29402,-14463,29391,-14485,29380,-14508,29369,-14531,29358,-14553,29346,-14576,29335,-14598,29324,-14621,29313,-14643,29302,-14666,29290,-14688,29279,-14710,29268,-14733,29256,-14755,29245,-14778,29234,-14800,29222,-14823,29211,-14845,29200,-14867,29188,-14890,29177,-14912,29165,-14935,29154,-14957,29142,-14979,29131,-15002,29119,-15024,29108,-15046,29096,-15069,29085,-15091,29073,-15113,29062,-15136,29050,-15158,29038,-15180,29027,-15202,29015,-15225,29003,-15247,28992,-15269,28980,-15291,28968,-15314,28956,-15336,28945,-15358,28933,-15380,28921,-15402,28909,-15425,28897,-15447,28886,-15469,28874,-15491,28862,-15513,28850,-15535,28838,-15557,28826,-15580,28814,-
15602,28802,-15624,28790,-15646,28778,-15668,28766,-15690,28754,-15712,28742,-15734,28730,-15756,28718,-15778,28706,-15800,28694,-15822,28681,-15844,28669,-15866,28657,-15888,28645,-15910,28633,-15932,28620,-15954,28608,-15976,28596,-15998,28584,-16020,28571,-16042,28559,-16064,28547,-16086,28534,-16108,28522,-16129,28510,-16151,28497,-16173,28485,-16195,28472,-16217,28460,-16239,28447,-16261,28435,-16282,28423,-16304,28410,-16326,28397,-16348,28385,-16369,28372,-16391,28360,-16413,28347,-16435,28335,-16456,28322,-16478,28309,-16500,28297,-16522,28284,-16543,28271,-16565,28259,-16587,28246,-16608,28233,-16630,28220,-16652,28208,-16673,28195,-16695,28182,-16717,28169,-16738,28156,-16760,28143,-16781,28131,-16803,28118,-16825,28105,-16846,28092,-16868,28079,-16889,28066,-16911,28053,-16932,28040,-16954,28027,-16975,28014,-16997,28001,-17018,27988,-17040,27975,-17061,27962,-17083,27948,-17104,27935,-17125,27922,-17147,27909,-17168,27896,-17190,27883,-17211,27869,-17233,27856,-17254,27843,-17275,27830,-17297,27816,-17318,27803,-17339,27790,-17361,27777,-17382,27763,-17403,27750,-17424,27736,-17446,27723,-17467,27710,-17488,27696,-17510,27683,-17531,27669,-17552,27656,-17573,27642,-17594,27629,-17616,27615,-17637,27602,-17658,27588,-17679,27575,-17700,27561,-17721,27548,-17743,27534,-17764,27520,-17785,27507,-17806,27493,-17827,27479,-17848,27466,-17869,27452,-17890,27438,-17911,27424,-17932,27411,-17953,27397,-17974,27383,-17995,27369,-18016,27355,-18037,27342,-18058,27328,-18079,27314,-18100,27300,-18121,27286,-18142,27272,-18163,27258,-18184,27244,-18205,27230,-18226,27216,-18247,27202,-18268,27188,-18288,27174,-18309,27160,-18330,27146,-18351,27132,-18372,27118,-18393,27104,-18413,27090,-18434,27076,-18455,27061,-18476,27047,-18496,27033,-18517,27019,-18538,27005,-18559,26990,-18579,26976,-18600,26962,-18621,26948,-18641,26933,-18662,26919,-18683,26905,-18703,26890,-18724,26876,-18745,26861,-18765,26847,-18786,26833,-18806,26818,-18827,26804,-18847,26789,-18868,26775
,-18889,26760,-18909,26746,-18930,26731,-18950,26717,-18971,26702,-18991,26688,-19012,26673,-19032,26658,-19052,26644,-19073,26629,-19093,26615,-19114,26600,-19134,26585,-19155,26570,-19175,26556,-19195,26541,-19216,26526,-19236,26512,-19256,26497,-19277,26482,-19297,26467,-19317,26452,-19338,26437,-19358,26423,-19378,26408,-19398,26393,-19419,26378,-19439,26363,-19459,26348,-19479,26333,-19500,26318,-19520,26303,-19540,26288,-19560,26273,-19580,26258,-19600,26243,-19621,26228,-19641,26213,-19661,26198,-19681,26183,-19701,26168,-19721,26153,-19741,26137,-19761,26122,-19781,26107,-19801,26092,-19821,26077,-19841,26061,-19861,26046,-19881,26031,-19901,26016,-19921,26000,-19941,25985,-19961,25970,-19981,25954,-20001,25939,-20021,25924,-20041,25908,-20061,25893,-20080,25878,-20100,25862,-20120,25847,-20140,25831,-20160,25816,-20180,25800,-20199,25785,-20219,25769,-20239,25754,-20259,25738,-20278,25723,-20298,25707,-20318,25691,-20338,25676,-20357,25660,-20377,25645,-20397,25629,-20416,25613,-20436,25598,-20456,25582,-20475,25566,-20495,25550,-20514,25535,-20534,25519,-20554,25503,-20573,25487,-20593,25472,-20612,25456,-20632,25440,-20651,25424,-20671,25408,-20690,25392,-20710,25376,-20729,25361,-20749,25345,-20768,25329,-20788,25313,-20807,25297,-20826,25281,-20846,25265,-20865,25249,-20885,25233,-20904,25217,-20923,25201,-20943,25185,-20962,25169,-20981,25152,-21001,25136,-21020,25120,-21039,25104,-21058,25088,-21078,25072,-21097,25056,-21116,25039,-21135,25023,-21155,25007,-21174,24991,-21193,24974,-21212,24958,-21231,24942,-21250,24926,-21269,24909,-21289,24893,-21308,24877,-21327,24860,-21346,24844,-21365,24827,-21384,24811,-21403,24795,-21422,24778,-21441,24762,-21460,24745,-21479,24729,-21498,24712,-21517,24696,-21536,24679,-21555,24663,-21574,24646,-21593,24630,-21612,24613,-21630,24596,-21649,24580,-21668,24563,-21687,24546,-21706,24530,-21725,24513,-21744,24496,-21762,24480,-21781,24463,-21800,24446,-21819,24430,-21837,24413,-21856,24396,-21875,24379,-21894,243
62,-21912,24346,-21931,24329,-21950,24312,-21968,24295,-21987,24278,-22005,24261,-22024,24244,-22043,24228,-22061,24211,-22080,24194,-22098,24177,-22117,24160,-22136,24143,-22154,24126,-22173,24109,-22191,24092,-22210,24075,-22228,24058,-22246,24041,-22265,24023,-22283,24006,-22302,23989,-22320,23972,-22339,23955,-22357,23938,-22375,23921,-22394,23903,-22412,23886,-22430,23869,-22449,23852,-22467,23835,-22485,23817,-22504,23800,-22522,23783,-22540,23766,-22558,23748,-22576,23731,-22595,23714,-22613,23696,-22631,23679,-22649,23661,-22667,23644,-22686,23627,-22704,23609,-22722,23592,-22740,23574,-22758,23557,-22776,23539,-22794,23522,-22812,23504,-22830,23487,-22848,23469,-22866,23452,-22884,23434,-22902,23417,-22920,23399,-22938,23382,-22956,23364,-22974,23346,-22992,23329,-23010,23311,-23028,23293,-23046,23276,-23063,23258,-23081,23240,-23099,23223,-23117,23205,-23135,23187,-23152,23169,-23170,23151,-23188,23134,-23206,23116,-23224,23098,-23241,23080,-23259,23062,-23277,23045,-23294,23027,-23312,23009,-23330,22991,-23347,22973,-23365,22955,-23383,22937,-23400,22919,-23418,22901,-23435,22883,-23453,22865,-23470,22847,-23488,22829,-23505,22811,-23523,22793,-23540,22775,-23558,22757,-23575,22739,-23593,22721,-23610,22703,-23628,22685,-23645,22666,-23662,22648,-23680,22630,-23697,22612,-23715,22594,-23732,22575,-23749,22557,-23767,22539,-23784,22521,-23801,22503,-23818,22484,-23836,22466,-23853,22448,-23870,22429,-23887,22411,-23904,22393,-23922,22374,-23939,22356,-23956,22338,-23973,22319,-23990,22301,-24007,22282,-24024,22264,-24042,22245,-24059,22227,-24076,22209,-24093,22190,-24110,22172,-24127,22153,-24144,22135,-24161,22116,-24178,22097,-24195,22079,-24212,22060,-24229,22042,-24245,22023,-24262,22004,-24279,21986,-24296,21967,-24313,21949,-24330,21930,-24347,21911,-24363,21893,-24380,21874,-24397,21855,-24414,21836,-24431,21818,-24447,21799,-24464,21780,-24481,21761,-24497,21743,-24514,21724,-24531,21705,-24547,21686,-24564,21667,-24581,21648,-24597,21629,-24614,2
1611,-24631,21592,-24647,21573,-24664,21554,-24680,21535,-24697,21516,-24713,21497,-24730,21478,-24746,21459,-24763,21440,-24779,21421,-24796,21402,-24812,21383,-24828,21364,-24845,21345,-24861,21326,-24878,21307,-24894,21288,-24910,21268,-24927,21249,-24943,21230,-24959,21211,-24975,21192,-24992,21173,-25008,21154,-25024,21134,-25040,21115,-25057,21096,-25073,21077,-25089,21057,-25105,21038,-25121,21019,-25137,21000,-25153,20980,-25170,20961,-25186,20942,-25202,20922,-25218,20903,-25234,20884,-25250,20864,-25266,20845,-25282,20825,-25298,20806,-25314,20787,-25330,20767,-25346,20748,-25362,20728,-25377,20709,-25393,20689,-25409,20670,-25425,20650,-25441,20631,-25457,20611,-25473,20592,-25488,20572,-25504,20553,-25520,20533,-25536,20513,-25551,20494,-25567,20474,-25583,20455,-25599,20435,-25614,20415,-25630,20396,-25646,20376,-25661,20356,-25677,20337,-25692,20317,-25708,20297,-25724,20277,-25739,20258,-25755,20238,-25770,20218,-25786,20198,-25801,20179,-25817,20159,-25832,20139,-25848,20119,-25863,20099,-25879,20079,-25894,20060,-25909,20040,-25925,20020,-25940,20000,-25955,19980,-25971,19960,-25986,19940,-26001,19920,-26017,19900,-26032,19880,-26047,19860,-26062,19840,-26078,19820,-26093,19800,-26108,19780,-26123,19760,-26138,19740,-26154,19720,-26169,19700,-26184,19680,-26199,19660,-26214,19640,-26229,19620,-26244,19599,-26259,19579,-26274,19559,-26289,19539,-26304,19519,-26319,19499,-26334,19478,-26349,19458,-26364,19438,-26379,19418,-26394,19397,-26409,19377,-26424,19357,-26438,19337,-26453,19316,-26468,19296,-26483,19276,-26498,19255,-26513,19235,-26527,19215,-26542,19194,-26557,19174,-26571,19154,-26586,19133,-26601,19113,-26616,19092,-26630,19072,-26645,19051,-26659,19031,-26674,19011,-26689,18990,-26703,18970,-26718,18949,-26732,18929,-26747,18908,-26761,18888,-26776,18867,-26790,18846,-26805,18826,-26819,18805,-26834,18785,-26848,18764,-26862,18744,-26877,18723,-26891,18702,-26906,18682,-26920,18661,-26934,18640,-26949,18620,-26963,18599,-26977,18578,-26991
,18558,-27006,18537,-27020,18516,-27034,18495,-27048,18475,-27062,18454,-27077,18433,-27091,18412,-27105,18392,-27119,18371,-27133,18350,-27147,18329,-27161,18308,-27175,18287,-27189,18267,-27203,18246,-27217,18225,-27231,18204,-27245,18183,-27259,18162,-27273,18141,-27287,18120,-27301,18099,-27315,18078,-27329,18057,-27343,18036,-27356,18015,-27370,17994,-27384,17973,-27398,17952,-27412,17931,-27425,17910,-27439,17889,-27453,17868,-27467,17847,-27480,17826,-27494,17805,-27508,17784,-27521,17763,-27535,17742,-27549,17720,-27562,17699,-27576,17678,-27589,17657,-27603,17636,-27616,17615,-27630,17593,-27643,17572,-27657,17551,-27670,17530,-27684,17509,-27697,17487,-27711,17466,-27724,17445,-27737,17423,-27751,17402,-27764,17381,-27778,17360,-27791,17338,-27804,17317,-27817,17296,-27831,17274,-27844,17253,-27857,17232,-27870,17210,-27884,17189,-27897,17167,-27910,17146,-27923,17124,-27936,17103,-27949,17082,-27963,17060,-27976,17039,-27989,17017,-28002,16996,-28015,16974,-28028,16953,-28041,16931,-28054,16910,-28067,16888,-28080,16867,-28093,16845,-28106,16824,-28119,16802,-28132,16780,-28144,16759,-28157,16737,-28170,16716,-28183,16694,-28196,16672,-28209,16651,-28221,16629,-28234,16607,-28247,16586,-28260,16564,-28272,16542,-28285,16521,-28298,16499,-28310,16477,-28323,16455,-28336,16434,-28348,16412,-28361,16390,-28373,16368,-28386,16347,-28398,16325,-28411,16303,-28424,16281,-28436,16260,-28448,16238,-28461,16216,-28473,16194,-28486,16172,-28498,16150,-28511,16128,-28523,16107,-28535,16085,-28548,16063,-28560,16041,-28572,16019,-28585,15997,-28597,15975,-28609,15953,-28621,15931,-28634,15909,-28646,15887,-28658,15865,-28670,15843,-28682,15821,-28695,15799,-28707,15777,-28719,15755,-28731,15733,-28743,15711,-28755,15689,-28767,15667,-28779,15645,-28791,15623,-28803,15601,-28815,15579,-28827,15556,-28839,15534,-28851,15512,-28863,15490,-28875,15468,-28887,15446,-28898,15424,-28910,15401,-28922,15379,-28934,15357,-28946,15335,-28957,15313,-28969,15290,-28981,15268,-289
93,15246,-29004,15224,-29016,15201,-29028,15179,-29039,15157,-29051,15135,-29063,15112,-29074,15090,-29086,15068,-29097,15045,-29109,15023,-29120,15001,-29132,14978,-29143,14956,-29155,14934,-29166,14911,-29178,14889,-29189,14866,-29201,14844,-29212,14822,-29223,14799,-29235,14777,-29246,14754,-29257,14732,-29269,14709,-29280,14687,-29291,14665,-29303,14642,-29314,14620,-29325,14597,-29336,14575,-29347,14552,-29359,14530,-29370,14507,-29381,14484,-29392,14462,-29403,14439,-29414,14417,-29425,14394,-29436,14372,-29447,14349,-29458,14326,-29469,14304,-29480,14281,-29491,14259,-29502,14236,-29513,14213,-29524,14191,-29535,14168,-29546,14145,-29557,14123,-29568,14100,-29578,14077,-29589,14055,-29600,14032,-29611,14009,-29622,13986,-29632,13964,-29643,13941,-29654,13918,-29664,13895,-29675,13873,-29686,13850,-29696,13827,-29707,13804,-29718,13782,-29728,13759,-29739,13736,-29749,13713,-29760,13690,-29770,13667,-29781,13645,-29791,13622,-29802,13599,-29812,13576,-29823,13553,-29833,13530,-29843,13507,-29854,13485,-29864,13462,-29874,13439,-29885,13416,-29895,13393,-29905,13370,-29916,13347,-29926,13324,-29936,13301,-29946,13278,-29956,13255,-29967,13232,-29977,13209,-29987,13186,-29997,13163,-30007,13140,-30017,13117,-30027,13094,-30037,13071,-30047,13048,-30057,13025,-30067,13002,-30077,12979,-30087,12956,-30097,12933,-30107,12909,-30117,12886,-30127,12863,-30137,12840,-30147,12817,-30157,12794,-30166,12771,-30176,12748,-30186,12724,-30196,12701,-30206,12678,-30215,12655,-30225,12632,-30235,12609,-30244,12585,-30254,12562,-30264,12539,-30273,12516,-30283,12492,-30292,12469,-30302,12446,-30312,12423,-30321,12399,-30331,12376,-30340,12353,-30350,12330,-30359,12306,-30369,12283,-30378,12260,-30387,12236,-30397,12213,-30406,12190,-30416,12166,-30425,12143,-30434,12120,-30443,12096,-30453,12073,-30462,12050,-30471,12026,-30481,12003,-30490,11980,-30499,11956,-30508,11933,-30517,11909,-30526,11886,-30536,11862,-30545,11839,-30554,11816,-30563,11792,-30572,11769,-30581,11745,-3
0590,11722,-30599,11698,-30608,11675,-30617,11651,-30626,11628,-30635,11604,-30644,11581,-30653,11557,-30661,11534,-30670,11510,-30679,11487,-30688,11463,-30697,11440,-30706,11416,-30714,11393,-30723,11369,-30732,11345,-30740,11322,-30749,11298,-30758,11275,-30767,11251,-30775,11227,-30784,11204,-30792,11180,-30801,11157,-30810,11133,-30818,11109,-30827,11086,-30835,11062,-30844,11038,-30852,11015,-30861,10991,-30869,10967,-30877,10944,-30886,10920,-30894,10896,-30903,10873,-30911,10849,-30919,10825,-30928,10801,-30936,10778,-30944,10754,-30952,10730,-30961,10706,-30969,10683,-30977,10659,-30985,10635,-30993,10611,-31002,10588,-31010,10564,-31018,10540,-31026,10516,-31034,10492,-31042,10469,-31050,10445,-31058,10421,-31066,10397,-31074,10373,-31082,10349,-31090,10326,-31098,10302,-31106,10278,-31114,10254,-31122,10230,-31129,10206,-31137,10182,-31145,10159,-31153,10135,-31161,10111,-31168,10087,-31176,10063,-31184,10039,-31192,10015,-31199,9991,-31207,9967,-31215,9943,-31222,9919,-31230,9895,-31237,9871,-31245,9847,-31253,9823,-31260,9799,-31268,9775,-31275,9751,-31283,9727,-31290,9703,-31298,9679,-31305,9655,-31312,9631,-31320,9607,-31327,9583,-31335,9559,-31342,9535,-31349,9511,-31357,9487,-31364,9463,-31371,9439,-31378,9415,-31386,9391,-31393,9367,-31400,9343,-31407,9319,-31414,9295,-31421,9270,-31429,9246,-31436,9222,-31443,9198,-31450,9174,-31457,9150,-31464,9126,-31471,9102,-31478,9077,-31485,9053,-31492,9029,-31499,9005,-31506,8981,-31513,8957,-31519,8932,-31526,8908,-31533,8884,-31540,8860,-31547,8836,-31554,8812,-31560,8787,-31567,8763,-31574,8739,-31581,8715,-31587,8690,-31594,8666,-31601,8642,-31607,8618,-31614,8593,-31620,8569,-31627,8545,-31634,8521,-31640,8496,-31647,8472,-31653,8448,-31660,8424,-31666,8399,-31673,8375,-31679,8351,-31685,8326,-31692,8302,-31698,8278,-31705,8253,-31711,8229,-31717,8205,-31724,8180,-31730,8156,-31736,8132,-31742,8107,-31749,8083,-31755,8059,-31761,8034,-31767,8010,-31773,7986,-31779,7961,-31786,7937,-31792,7912,-31798,78
88,-31804,7864,-31810,7839,-31816,7815,-31822,7790,-31828,7766,-31834,7742,-31840,7717,-31846,7693,-31852,7668,-31857,7644,-31863,7619,-31869,7595,-31875,7571,-31881,7546,-31887,7522,-31892,7497,-31898,7473,-31904,7448,-31910,7424,-31915,7399,-31921,7375,-31927,7350,-31932,7326,-31938,7301,-31944,7277,-31949,7252,-31955,7228,-31960,7203,-31966,7179,-31971,7154,-31977,7130,-31982,7105,-31988,7081,-31993,7056,-31999,7032,-32004,7007,-32009,6982,-32015,6958,-32020,6933,-32025,6909,-32031,6884,-32036,6860,-32041,6835,-32047,6811,-32052,6786,-32057,6761,-32062,6737,-32067,6712,-32073,6688,-32078,6663,-32083,6638,-32088,6614,-32093,6589,-32098,6564,-32103,6540,-32108,6515,-32113,6491,-32118,6466,-32123,6441,-32128,6417,-32133,6392,-32138,6367,-32143,6343,-32148,6318,-32153,6293,-32157,6269,-32162,6244,-32167,6219,-32172,6195,-32177,6170,-32181,6145,-32186,6121,-32191,6096,-32195,6071,-32200,6047,-32205,6022,-32209,5997,-32214,5972,-32219,5948,-32223,5923,-32228,5898,-32232,5874,-32237,5849,-32241,5824,-32246,5799,-32250,5775,-32255,5750,-32259,5725,-32263,5700,-32268,5676,-32272,5651,-32276,5626,-32281,5601,-32285,5577,-32289,5552,-32294,5527,-32298,5502,-32302,5478,-32306,5453,-32311,5428,-32315,5403,-32319,5378,-32323,5354,-32327,5329,-32331,5304,-32335,5279,-32339,5254,-32343,5230,-32347,5205,-32351,5180,-32355,5155,-32359,5130,-32363,5106,-32367,5081,-32371,5056,-32375,5031,-32379,5006,-32383,4981,-32387,4957,-32390,4932,-32394,4907,-32398,4882,-32402,4857,-32405,4832,-32409,4807,-32413,4783,-32417,4758,-32420,4733,-32424,4708,-32427,4683,-32431,4658,-32435,4633,-32438,4608,-32442,4584,-32445,4559,-32449,4534,-32452,4509,-32456,4484,-32459,4459,-32463,4434,-32466,4409,-32469,4384,-32473,4359,-32476,4335,-32479,4310,-32483,4285,-32486,4260,-32489,4235,-32493,4210,-32496,4185,-32499,4160,-32502,4135,-32505,4110,-32509,4085,-32512,4060,-32515,4035,-32518,4011,-32521,3986,-32524,3961,-32527,3936,-32530,3911,-32533,3886,-32536,3861,-32539,3836,-32542,3811,-32545,3786,-3254
8,3761,-32551,3736,-32554,3711,-32557,3686,-32559,3661,-32562,3636,-32565,3611,-32568,3586,-32571,3561,-32573,3536,-32576,3511,-32579,3486,-32581,3461,-32584,3436,-32587,3411,-32589,3386,-32592,3361,-32595,3336,-32597,3311,-32600,3286,-32602,3261,-32605,3236,-32607,3211,-32610,3186,-32612,3161,-32615,3136,-32617,3111,-32619,3086,-32622,3061,-32624,3036,-32626,3011,-32629,2986,-32631,2961,-32633,2936,-32636,2911,-32638,2886,-32640,2861,-32642,2836,-32645,2811,-32647,2786,-32649,2761,-32651,2736,-32653,2711,-32655,2686,-32657,2661,-32659,2636,-32661,2610,-32663,2585,-32665,2560,-32667,2535,-32669,2510,-32671,2485,-32673,2460,-32675,2435,-32677,2410,-32679,2385,-32681,2360,-32682,2335,-32684,2310,-32686,2285,-32688,2260,-32689,2235,-32691,2209,-32693,2184,-32695,2159,-32696,2134,-32698,2109,-32700,2084,-32701,2059,-32703,2034,-32704,2009,-32706,1984,-32707,1959,-32709,1934,-32710,1908,-32712,1883,-32713,1858,-32715,1833,-32716,1808,-32718,1783,-32719,1758,-32720,1733,-32722,1708,-32723,1683,-32724,1658,-32726,1632,-32727,1607,-32728,1582,-32729,1557,-32730,1532,-32732,1507,-32733,1482,-32734,1457,-32735,1432,-32736,1406,-32737,1381,-32738,1356,-32739,1331,-32740,1306,-32741,1281,-32742,1256,-32743,1231,-32744,1206,-32745,1180,-32746,1155,-32747,1130,-32748,1105,-32749,1080,-32750,1055,-32751,1030,-32751,1005,-32752,980,-32753,954,-32754,929,-32754,904,-32755,879,-32756,854,-32756,829,-32757,804,-32758,779,-32758,753,-32759,728,-32759,703,-32760,678,-32760,653,-32761,628,-32761,603,-32762,578,-32762,552,-32763,527,-32763,502,-32764,477,-32764,452,-32764,427,-32765,402,-32765,376,-32765,351,-32766,326,-32766,301,-32766,276,-32766,251,-32767,226,-32767,201,-32767,175,-32767,150,-32767,125,-32767,100,-32767,75,-32767,50,-32767,25,-32767,0,-32767,-26,-32767,-51,-32767,-76,-32767,-101,-32767,-126,-32767,-151,-32767,-176,-32767,-202,-32767,-227,-32767,-252,-32767,-277,-32766,-302,-32766,-327,-32766,-352,-32766,-377,-32765,-403,-32765,-428,-32765,-453,-32764,-478,-32764,-503,-
32764,-528,-32763,-553,-32763,-579,-32762,-604,-32762,-629,-32761,-654,-32761,-679,-32760,-704,-32760,-729,-32759,-754,-32759,-780,-32758,-805,-32758,-830,-32757,-855,-32756,-880,-32756,-905,-32755,-930,-32754,-955,-32754,-981,-32753,-1006,-32752,-1031,-32751,-1056,-32751,-1081,-32750,-1106,-32749,-1131,-32748,-1156,-32747,-1181,-32746,-1207,-32745,-1232,-32744,-1257,-32743,-1282,-32742,-1307,-32741,-1332,-32740,-1357,-32739,-1382,-32738,-1407,-32737,-1433,-32736,-1458,-32735,-1483,-32734,-1508,-32733,-1533,-32732,-1558,-32730,-1583,-32729,-1608,-32728,-1633,-32727,-1659,-32726,-1684,-32724,-1709,-32723,-1734,-32722,-1759,-32720,-1784,-32719,-1809,-32718,-1834,-32716,-1859,-32715,-1884,-32713,-1909,-32712,-1935,-32710,-1960,-32709,-1985,-32707,-2010,-32706,-2035,-32704,-2060,-32703,-2085,-32701,-2110,-32700,-2135,-32698,-2160,-32696,-2185,-32695,-2210,-32693,-2236,-32691,-2261,-32689,-2286,-32688,-2311,-32686,-2336,-32684,-2361,-32682,-2386,-32681,-2411,-32679,-2436,-32677,-2461,-32675,-2486,-32673,-2511,-32671,-2536,-32669,-2561,-32667,-2586,-32665,-2611,-32663,-2637,-32661,-2662,-32659,-2687,-32657,-2712,-32655,-2737,-32653,-2762,-32651,-2787,-32649,-2812,-32647,-2837,-32645,-2862,-32642,-2887,-32640,-2912,-32638,-2937,-32636,-2962,-32633,-2987,-32631,-3012,-32629,-3037,-32626,-3062,-32624,-3087,-32622,-3112,-32619,-3137,-32617,-3162,-32615,-3187,-32612,-3212,-32610,-3237,-32607,-3262,-32605,-3287,-32602,-3312,-32600,-3337,-32597,-3362,-32595,-3387,-32592,-3412,-32589,-3437,-32587,-3462,-32584,-3487,-32581,-3512,-32579,-3537,-32576,-3562,-32573,-3587,-32571,-3612,-32568,-3637,-32565,-3662,-32562,-3687,-32559,-3712,-32557,-3737,-32554,-3762,-32551,-3787,-32548,-3812,-32545,-3837,-32542,-3862,-32539,-3887,-32536,-3912,-32533,-3937,-32530,-3962,-32527,-3987,-32524,-4012,-32521,-4036,-32518,-4061,-32515,-4086,-32512,-4111,-32509,-4136,-32505,-4161,-32502,-4186,-32499,-4211,-32496,-4236,-32493,-4261,-32489,-4286,-32486,-4311,-32483,-4336,-32479,-4360,-32476,-4385,-3247
3,-4410,-32469,-4435,-32466,-4460,-32463,-4485,-32459,-4510,-32456,-4535,-32452,-4560,-32449,-4585,-32445,-4609,-32442,-4634,-32438,-4659,-32435,-4684,-32431,-4709,-32427,-4734,-32424,-4759,-32420,-4784,-32417,-4808,-32413,-4833,-32409,-4858,-32405,-4883,-32402,-4908,-32398,-4933,-32394,-4958,-32390,-4982,-32387,-5007,-32383,-5032,-32379,-5057,-32375,-5082,-32371,-5107,-32367,-5131,-32363,-5156,-32359,-5181,-32355,-5206,-32351,-5231,-32347,-5255,-32343,-5280,-32339,-5305,-32335,-5330,-32331,-5355,-32327,-5379,-32323,-5404,-32319,-5429,-32315,-5454,-32311,-5479,-32306,-5503,-32302,-5528,-32298,-5553,-32294,-5578,-32289,-5602,-32285,-5627,-32281,-5652,-32276,-5677,-32272,-5701,-32268,-5726,-32263,-5751,-32259,-5776,-32255,-5800,-32250,-5825,-32246,-5850,-32241,-5875,-32237,-5899,-32232,-5924,-32228,-5949,-32223,-5973,-32219,-5998,-32214,-6023,-32209,-6048,-32205,-6072,-32200,-6097,-32195,-6122,-32191,-6146,-32186,-6171,-32181,-6196,-32177,-6220,-32172,-6245,-32167,-6270,-32162,-6294,-32157,-6319,-32153,-6344,-32148,-6368,-32143,-6393,-32138,-6418,-32133,-6442,-32128,-6467,-32123,-6492,-32118,-6516,-32113,-6541,-32108,-6565,-32103,-6590,-32098,-6615,-32093,-6639,-32088,-6664,-32083,-6689,-32078,-6713,-32073,-6738,-32067,-6762,-32062,-6787,-32057,-6812,-32052,-6836,-32047,-6861,-32041,-6885,-32036,-6910,-32031,-6934,-32025,-6959,-32020,-6983,-32015,-7008,-32009,-7033,-32004,-7057,-31999,-7082,-31993,-7106,-31988,-7131,-31982,-7155,-31977,-7180,-31971,-7204,-31966,-7229,-31960,-7253,-31955,-7278,-31949,-7302,-31944,-7327,-31938,-7351,-31932,-7376,-31927,-7400,-31921,-7425,-31915,-7449,-31910,-7474,-31904,-7498,-31898,-7523,-31892,-7547,-31887,-7572,-31881,-7596,-31875,-7620,-31869,-7645,-31863,-7669,-31857,-7694,-31852,-7718,-31846,-7743,-31840,-7767,-31834,-7791,-31828,-7816,-31822,-7840,-31816,-7865,-31810,-7889,-31804,-7913,-31798,-7938,-31792,-7962,-31786,-7987,-31779,-8011,-31773,-8035,-31767,-8060,-31761,-8084,-31755,-8108,-31749,-8133,-31742,-8157,-31736,-8181,-31
730,-8206,-31724,-8230,-31717,-8254,-31711,-8279,-31705,-8303,-31698,-8327,-31692,-8352,-31685,-8376,-31679,-8400,-31673,-8425,-31666,-8449,-31660,-8473,-31653,-8497,-31647,-8522,-31640,-8546,-31634,-8570,-31627,-8594,-31620,-8619,-31614,-8643,-31607,-8667,-31601,-8691,-31594,-8716,-31587,-8740,-31581,-8764,-31574,-8788,-31567,-8813,-31560,-8837,-31554,-8861,-31547,-8885,-31540,-8909,-31533,-8933,-31526,-8958,-31519,-8982,-31513,-9006,-31506,-9030,-31499,-9054,-31492,-9078,-31485,-9103,-31478,-9127,-31471,-9151,-31464,-9175,-31457,-9199,-31450,-9223,-31443,-9247,-31436,-9271,-31429,-9296,-31421,-9320,-31414,-9344,-31407,-9368,-31400,-9392,-31393,-9416,-31386,-9440,-31378,-9464,-31371,-9488,-31364,-9512,-31357,-9536,-31349,-9560,-31342,-9584,-31335,-9608,-31327,-9632,-31320,-9656,-31312,-9680,-31305,-9704,-31298,-9728,-31290,-9752,-31283,-9776,-31275,-9800,-31268,-9824,-31260,-9848,-31253,-9872,-31245,-9896,-31237,-9920,-31230,-9944,-31222,-9968,-31215,-9992,-31207,-10016,-31199,-10040,-31192,-10064,-31184,-10088,-31176,-10112,-31168,-10136,-31161,-10160,-31153,-10183,-31145,-10207,-31137,-10231,-31129,-10255,-31122,-10279,-31114,-10303,-31106,-10327,-31098,-10350,-31090,-10374,-31082,-10398,-31074,-10422,-31066,-10446,-31058,-10470,-31050,-10493,-31042,-10517,-31034,-10541,-31026,-10565,-31018,-10589,-31010,-10612,-31002,-10636,-30993,-10660,-30985,-10684,-30977,-10707,-30969,-10731,-30961,-10755,-30952,-10779,-30944,-10802,-30936,-10826,-30928,-10850,-30919,-10874,-30911,-10897,-30903,-10921,-30894,-10945,-30886,-10968,-30877,-10992,-30869,-11016,-30861,-11039,-30852,-11063,-30844,-11087,-30835,-11110,-30827,-11134,-30818,-11158,-30810,-11181,-30801,-11205,-30792,-11228,-30784,-11252,-30775,-11276,-30767,-11299,-30758,-11323,-30749,-11346,-30740,-11370,-30732,-11394,-30723,-11417,-30714,-11441,-30706,-11464,-30697,-11488,-30688,-11511,-30679,-11535,-30670,-11558,-30661,-11582,-30653,-11605,-30644,-11629,-30635,-11652,-30626,-11676,-30617,-11699,-30608,-11723,-30599
,-11746,-30590,-11770,-30581,-11793,-30572,-11817,-30563,-11840,-30554,-11863,-30545,-11887,-30536,-11910,-30526,-11934,-30517,-11957,-30508,-11981,-30499,-12004,-30490,-12027,-30481,-12051,-30471,-12074,-30462,-12097,-30453,-12121,-30443,-12144,-30434,-12167,-30425,-12191,-30416,-12214,-30406,-12237,-30397,-12261,-30387,-12284,-30378,-12307,-30369,-12331,-30359,-12354,-30350,-12377,-30340,-12400,-30331,-12424,-30321,-12447,-30312,-12470,-30302,-12493,-30292,-12517,-30283,-12540,-30273,-12563,-30264,-12586,-30254,-12610,-30244,-12633,-30235,-12656,-30225,-12679,-30215,-12702,-30206,-12725,-30196,-12749,-30186,-12772,-30176,-12795,-30166,-12818,-30157,-12841,-30147,-12864,-30137,-12887,-30127,-12910,-30117,-12934,-30107,-12957,-30097,-12980,-30087,-13003,-30077,-13026,-30067,-13049,-30057,-13072,-30047,-13095,-30037,-13118,-30027,-13141,-30017,-13164,-30007,-13187,-29997,-13210,-29987,-13233,-29977,-13256,-29967,-13279,-29956,-13302,-29946,-13325,-29936,-13348,-29926,-13371,-29916,-13394,-29905,-13417,-29895,-13440,-29885,-13463,-29874,-13486,-29864,-13508,-29854,-13531,-29843,-13554,-29833,-13577,-29823,-13600,-29812,-13623,-29802,-13646,-29791,-13668,-29781,-13691,-29770,-13714,-29760,-13737,-29749,-13760,-29739,-13783,-29728,-13805,-29718,-13828,-29707,-13851,-29696,-13874,-29686,-13896,-29675,-13919,-29664,-13942,-29654,-13965,-29643,-13987,-29632,-14010,-29622,-14033,-29611,-14056,-29600,-14078,-29589,-14101,-29578,-14124,-29568,-14146,-29557,-14169,-29546,-14192,-29535,-14214,-29524,-14237,-29513,-14260,-29502,-14282,-29491,-14305,-29480,-14327,-29469,-14350,-29458,-14373,-29447,-14395,-29436,-14418,-29425,-14440,-29414,-14463,-29403,-14485,-29392,-14508,-29381,-14531,-29370,-14553,-29359,-14576,-29347,-14598,-29336,-14621,-29325,-14643,-29314,-14666,-29303,-14688,-29291,-14710,-29280,-14733,-29269,-14755,-29257,-14778,-29246,-14800,-29235,-14823,-29223,-14845,-29212,-14867,-29201,-14890,-29189,-14912,-29178,-14935,-29166,-14957,-29155,-14979,-29143,-15002,-291
32,-15024,-29120,-15046,-29109,-15069,-29097,-15091,-29086,-15113,-29074,-15136,-29063,-15158,-29051,-15180,-29039,-15202,-29028,-15225,-29016,-15247,-29004,-15269,-28993,-15291,-28981,-15314,-28969,-15336,-28957,-15358,-28946,-15380,-28934,-15402,-28922,-15425,-28910,-15447,-28898,-15469,-28887,-15491,-28875,-15513,-28863,-15535,-28851,-15557,-28839,-15580,-28827,-15602,-28815,-15624,-28803,-15646,-28791,-15668,-28779,-15690,-28767,-15712,-28755,-15734,-28743,-15756,-28731,-15778,-28719,-15800,-28707,-15822,-28695,-15844,-28682,-15866,-28670,-15888,-28658,-15910,-28646,-15932,-28634,-15954,-28621,-15976,-28609,-15998,-28597,-16020,-28585,-16042,-28572,-16064,-28560,-16086,-28548,-16108,-28535,-16129,-28523,-16151,-28511,-16173,-28498,-16195,-28486,-16217,-28473,-16239,-28461,-16261,-28448,-16282,-28436,-16304,-28424,-16326,-28411,-16348,-28398,-16369,-28386,-16391,-28373,-16413,-28361,-16435,-28348,-16456,-28336,-16478,-28323,-16500,-28310,-16522,-28298,-16543,-28285,-16565,-28272,-16587,-28260,-16608,-28247,-16630,-28234,-16652,-28221,-16673,-28209,-16695,-28196,-16717,-28183,-16738,-28170,-16760,-28157,-16781,-28144,-16803,-28132,-16825,-28119,-16846,-28106,-16868,-28093,-16889,-28080,-16911,-28067,-16932,-28054,-16954,-28041,-16975,-28028,-16997,-28015,-17018,-28002,-17040,-27989,-17061,-27976,-17083,-27963,-17104,-27949,-17125,-27936,-17147,-27923,-17168,-27910,-17190,-27897,-17211,-27884,-17233,-27870,-17254,-27857,-17275,-27844,-17297,-27831,-17318,-27817,-17339,-27804,-17361,-27791,-17382,-27778,-17403,-27764,-17424,-27751,-17446,-27737,-17467,-27724,-17488,-27711,-17510,-27697,-17531,-27684,-17552,-27670,-17573,-27657,-17594,-27643,-17616,-27630,-17637,-27616,-17658,-27603,-17679,-27589,-17700,-27576,-17721,-27562,-17743,-27549,-17764,-27535,-17785,-27521,-17806,-27508,-17827,-27494,-17848,-27480,-17869,-27467,-17890,-27453,-17911,-27439,-17932,-27425,-17953,-27412,-17974,-27398,-17995,-27384,-18016,-27370,-18037,-27356,-18058,-27343,-18079,-27329,-18100,-2
7315,-18121,-27301,-18142,-27287,-18163,-27273,-18184,-27259,-18205,-27245,-18226,-27231,-18247,-27217,-18268,-27203,-18288,-27189,-18309,-27175,-18330,-27161,-18351,-27147,-18372,-27133,-18393,-27119,-18413,-27105,-18434,-27091,-18455,-27077,-18476,-27062,-18496,-27048,-18517,-27034,-18538,-27020,-18559,-27006,-18579,-26991,-18600,-26977,-18621,-26963,-18641,-26949,-18662,-26934,-18683,-26920,-18703,-26906,-18724,-26891,-18745,-26877,-18765,-26862,-18786,-26848,-18806,-26834,-18827,-26819,-18847,-26805,-18868,-26790,-18889,-26776,-18909,-26761,-18930,-26747,-18950,-26732,-18971,-26718,-18991,-26703,-19012,-26689,-19032,-26674,-19052,-26659,-19073,-26645,-19093,-26630,-19114,-26616,-19134,-26601,-19155,-26586,-19175,-26571,-19195,-26557,-19216,-26542,-19236,-26527,-19256,-26513,-19277,-26498,-19297,-26483,-19317,-26468,-19338,-26453,-19358,-26438,-19378,-26424,-19398,-26409,-19419,-26394,-19439,-26379,-19459,-26364,-19479,-26349,-19500,-26334,-19520,-26319,-19540,-26304,-19560,-26289,-19580,-26274,-19600,-26259,-19621,-26244,-19641,-26229,-19661,-26214,-19681,-26199,-19701,-26184,-19721,-26169,-19741,-26154,-19761,-26138,-19781,-26123,-19801,-26108,-19821,-26093,-19841,-26078,-19861,-26062,-19881,-26047,-19901,-26032,-19921,-26017,-19941,-26001,-19961,-25986,-19981,-25971,-20001,-25955,-20021,-25940,-20041,-25925,-20061,-25909,-20080,-25894,-20100,-25879,-20120,-25863,-20140,-25848,-20160,-25832,-20180,-25817,-20199,-25801,-20219,-25786,-20239,-25770,-20259,-25755,-20278,-25739,-20298,-25724,-20318,-25708,-20338,-25692,-20357,-25677,-20377,-25661,-20397,-25646,-20416,-25630,-20436,-25614,-20456,-25599,-20475,-25583,-20495,-25567,-20514,-25551,-20534,-25536,-20554,-25520,-20573,-25504,-20593,-25488,-20612,-25473,-20632,-25457,-20651,-25441,-20671,-25425,-20690,-25409,-20710,-25393,-20729,-25377,-20749,-25362,-20768,-25346,-20788,-25330,-20807,-25314,-20826,-25298,-20846,-25282,-20865,-25266,-20885,-25250,-20904,-25234,-20923,-25218,-20943,-25202,-20962,-25186,-20981,
-25170,-21001,-25153,-21020,-25137,-21039,-25121,-21058,-25105,-21078,-25089,-21097,-25073,-21116,-25057,-21135,-25040,-21155,-25024,-21174,-25008,-21193,-24992,-21212,-24975,-21231,-24959,-21250,-24943,-21269,-24927,-21289,-24910,-21308,-24894,-21327,-24878,-21346,-24861,-21365,-24845,-21384,-24828,-21403,-24812,-21422,-24796,-21441,-24779,-21460,-24763,-21479,-24746,-21498,-24730,-21517,-24713,-21536,-24697,-21555,-24680,-21574,-24664,-21593,-24647,-21612,-24631,-21630,-24614,-21649,-24597,-21668,-24581,-21687,-24564,-21706,-24547,-21725,-24531,-21744,-24514,-21762,-24497,-21781,-24481,-21800,-24464,-21819,-24447,-21837,-24431,-21856,-24414,-21875,-24397,-21894,-24380,-21912,-24363,-21931,-24347,-21950,-24330,-21968,-24313,-21987,-24296,-22005,-24279,-22024,-24262,-22043,-24245,-22061,-24229,-22080,-24212,-22098,-24195,-22117,-24178,-22136,-24161,-22154,-24144,-22173,-24127,-22191,-24110,-22210,-24093,-22228,-24076,-22246,-24059,-22265,-24042,-22283,-24024,-22302,-24007,-22320,-23990,-22339,-23973,-22357,-23956,-22375,-23939,-22394,-23922,-22412,-23904,-22430,-23887,-22449,-23870,-22467,-23853,-22485,-23836,-22504,-23818,-22522,-23801,-22540,-23784,-22558,-23767,-22576,-23749,-22595,-23732,-22613,-23715,-22631,-23697,-22649,-23680,-22667,-23662,-22686,-23645,-22704,-23628,-22722,-23610,-22740,-23593,-22758,-23575,-22776,-23558,-22794,-23540,-22812,-23523,-22830,-23505,-22848,-23488,-22866,-23470,-22884,-23453,-22902,-23435,-22920,-23418,-22938,-23400,-22956,-23383,-22974,-23365,-22992,-23347,-23010,-23330,-23028,-23312,-23046,-23294,-23063,-23277,-23081,-23259,-23099,-23241,-23117,-23224,-23135,-23206,-23152,-23188,-23170,-23170,-23188,-23152,-23206,-23135,-23224,-23117,-23241,-23099,-23259,-23081,-23277,-23063,-23294,-23046,-23312,-23028,-23330,-23010,-23347,-22992,-23365,-22974,-23383,-22956,-23400,-22938,-23418,-22920,-23435,-22902,-23453,-22884,-23470,-22866,-23488,-22848,-23505,-22830,-23523,-22812,-23540,-22794,-23558,-22776,-23575,-22758,-23593,-22740,-2361
0,-22722,-23628,-22704,-23645,-22686,-23662,-22667,-23680,-22649,-23697,-22631,-23715,-22613,-23732,-22595,-23749,-22576,-23767,-22558,-23784,-22540,-23801,-22522,-23818,-22504,-23836,-22485,-23853,-22467,-23870,-22449,-23887,-22430,-23904,-22412,-23922,-22394,-23939,-22375,-23956,-22357,-23973,-22339,-23990,-22320,-24007,-22302,-24024,-22283,-24042,-22265,-24059,-22246,-24076,-22228,-24093,-22210,-24110,-22191,-24127,-22173,-24144,-22154,-24161,-22136,-24178,-22117,-24195,-22098,-24212,-22080,-24229,-22061,-24245,-22043,-24262,-22024,-24279,-22005,-24296,-21987,-24313,-21968,-24330,-21950,-24347,-21931,-24363,-21912,-24380,-21894,-24397,-21875,-24414,-21856,-24431,-21837,-24447,-21819,-24464,-21800,-24481,-21781,-24497,-21762,-24514,-21744,-24531,-21725,-24547,-21706,-24564,-21687,-24581,-21668,-24597,-21649,-24614,-21630,-24631,-21612,-24647,-21593,-24664,-21574,-24680,-21555,-24697,-21536,-24713,-21517,-24730,-21498,-24746,-21479,-24763,-21460,-24779,-21441,-24796,-21422,-24812,-21403,-24828,-21384,-24845,-21365,-24861,-21346,-24878,-21327,-24894,-21308,-24910,-21289,-24927,-21269,-24943,-21250,-24959,-21231,-24975,-21212,-24992,-21193,-25008,-21174,-25024,-21155,-25040,-21135,-25057,-21116,-25073,-21097,-25089,-21078,-25105,-21058,-25121,-21039,-25137,-21020,-25153,-21001,-25170,-20981,-25186,-20962,-25202,-20943,-25218,-20923,-25234,-20904,-25250,-20885,-25266,-20865,-25282,-20846,-25298,-20826,-25314,-20807,-25330,-20788,-25346,-20768,-25362,-20749,-25377,-20729,-25393,-20710,-25409,-20690,-25425,-20671,-25441,-20651,-25457,-20632,-25473,-20612,-25488,-20593,-25504,-20573,-25520,-20554,-25536,-20534,-25551,-20514,-25567,-20495,-25583,-20475,-25599,-20456,-25614,-20436,-25630,-20416,-25646,-20397,-25661,-20377,-25677,-20357,-25692,-20338,-25708,-20318,-25724,-20298,-25739,-20278,-25755,-20259,-25770,-20239,-25786,-20219,-25801,-20199,-25817,-20180,-25832,-20160,-25848,-20140,-25863,-20120,-25879,-20100,-25894,-20080,-25909,-20061,-25925,-20041,-25940,-20021,-25
955,-20001,-25971,-19981,-25986,-19961,-26001,-19941,-26017,-19921,-26032,-19901,-26047,-19881,-26062,-19861,-26078,-19841,-26093,-19821,-26108,-19801,-26123,-19781,-26138,-19761,-26154,-19741,-26169,-19721,-26184,-19701,-26199,-19681,-26214,-19661,-26229,-19641,-26244,-19621,-26259,-19600,-26274,-19580,-26289,-19560,-26304,-19540,-26319,-19520,-26334,-19500,-26349,-19479,-26364,-19459,-26379,-19439,-26394,-19419,-26409,-19398,-26424,-19378,-26438,-19358,-26453,-19338,-26468,-19317,-26483,-19297,-26498,-19277,-26513,-19256,-26527,-19236,-26542,-19216,-26557,-19195,-26571,-19175,-26586,-19155,-26601,-19134,-26616,-19114,-26630,-19093,-26645,-19073,-26659,-19052,-26674,-19032,-26689,-19012,-26703,-18991,-26718,-18971,-26732,-18950,-26747,-18930,-26761,-18909,-26776,-18889,-26790,-18868,-26805,-18847,-26819,-18827,-26834,-18806,-26848,-18786,-26862,-18765,-26877,-18745,-26891,-18724,-26906,-18703,-26920,-18683,-26934,-18662,-26949,-18641,-26963,-18621,-26977,-18600,-26991,-18579,-27006,-18559,-27020,-18538,-27034,-18517,-27048,-18496,-27062,-18476,-27077,-18455,-27091,-18434,-27105,-18413,-27119,-18393,-27133,-18372,-27147,-18351,-27161,-18330,-27175,-18309,-27189,-18288,-27203,-18268,-27217,-18247,-27231,-18226,-27245,-18205,-27259,-18184,-27273,-18163,-27287,-18142,-27301,-18121,-27315,-18100,-27329,-18079,-27343,-18058,-27356,-18037,-27370,-18016,-27384,-17995,-27398,-17974,-27412,-17953,-27425,-17932,-27439,-17911,-27453,-17890,-27467,-17869,-27480,-17848,-27494,-17827,-27508,-17806,-27521,-17785,-27535,-17764,-27549,-17743,-27562,-17721,-27576,-17700,-27589,-17679,-27603,-17658,-27616,-17637,-27630,-17616,-27643,-17594,-27657,-17573,-27670,-17552,-27684,-17531,-27697,-17510,-27711,-17488,-27724,-17467,-27737,-17446,-27751,-17424,-27764,-17403,-27778,-17382,-27791,-17361,-27804,-17339,-27817,-17318,-27831,-17297,-27844,-17275,-27857,-17254,-27870,-17233,-27884,-17211,-27897,-17190,-27910,-17168,-27923,-17147,-27936,-17125,-27949,-17104,-27963,-17083,-27976,-17061,-
27989,-17040,-28002,-17018,-28015,-16997,-28028,-16975,-28041,-16954,-28054,-16932,-28067,-16911,-28080,-16889,-28093,-16868,-28106,-16846,-28119,-16825,-28132,-16803,-28144,-16781,-28157,-16760,-28170,-16738,-28183,-16717,-28196,-16695,-28209,-16673,-28221,-16652,-28234,-16630,-28247,-16608,-28260,-16587,-28272,-16565,-28285,-16543,-28298,-16522,-28310,-16500,-28323,-16478,-28336,-16456,-28348,-16435,-28361,-16413,-28373,-16391,-28386,-16369,-28398,-16348,-28411,-16326,-28424,-16304,-28436,-16282,-28448,-16261,-28461,-16239,-28473,-16217,-28486,-16195,-28498,-16173,-28511,-16151,-28523,-16129,-28535,-16108,-28548,-16086,-28560,-16064,-28572,-16042,-28585,-16020,-28597,-15998,-28609,-15976,-28621,-15954,-28634,-15932,-28646,-15910,-28658,-15888,-28670,-15866,-28682,-15844,-28695,-15822,-28707,-15800,-28719,-15778,-28731,-15756,-28743,-15734,-28755,-15712,-28767,-15690,-28779,-15668,-28791,-15646,-28803,-15624,-28815,-15602,-28827,-15580,-28839,-15557,-28851,-15535,-28863,-15513,-28875,-15491,-28887,-15469,-28898,-15447,-28910,-15425,-28922,-15402,-28934,-15380,-28946,-15358,-28957,-15336,-28969,-15314,-28981,-15291,-28993,-15269,-29004,-15247,-29016,-15225,-29028,-15202,-29039,-15180,-29051,-15158,-29063,-15136,-29074,-15113,-29086,-15091,-29097,-15069,-29109,-15046,-29120,-15024,-29132,-15002,-29143,-14979,-29155,-14957,-29166,-14935,-29178,-14912,-29189,-14890,-29201,-14867,-29212,-14845,-29223,-14823,-29235,-14800,-29246,-14778,-29257,-14755,-29269,-14733,-29280,-14710,-29291,-14688,-29303,-14666,-29314,-14643,-29325,-14621,-29336,-14598,-29347,-14576,-29359,-14553,-29370,-14531,-29381,-14508,-29392,-14485,-29403,-14463,-29414,-14440,-29425,-14418,-29436,-14395,-29447,-14373,-29458,-14350,-29469,-14327,-29480,-14305,-29491,-14282,-29502,-14260,-29513,-14237,-29524,-14214,-29535,-14192,-29546,-14169,-29557,-14146,-29568,-14124,-29578,-14101,-29589,-14078,-29600,-14056,-29611,-14033,-29622,-14010,-29632,-13987,-29643,-13965,-29654,-13942,-29664,-13919,-29675,-13896
,-29686,-13874,-29696,-13851,-29707,-13828,-29718,-13805,-29728,-13783,-29739,-13760,-29749,-13737,-29760,-13714,-29770,-13691,-29781,-13668,-29791,-13646,-29802,-13623,-29812,-13600,-29823,-13577,-29833,-13554,-29843,-13531,-29854,-13508,-29864,-13486,-29874,-13463,-29885,-13440,-29895,-13417,-29905,-13394,-29916,-13371,-29926,-13348,-29936,-13325,-29946,-13302,-29956,-13279,-29967,-13256,-29977,-13233,-29987,-13210,-29997,-13187,-30007,-13164,-30017,-13141,-30027,-13118,-30037,-13095,-30047,-13072,-30057,-13049,-30067,-13026,-30077,-13003,-30087,-12980,-30097,-12957,-30107,-12934,-30117,-12910,-30127,-12887,-30137,-12864,-30147,-12841,-30157,-12818,-30166,-12795,-30176,-12772,-30186,-12749,-30196,-12725,-30206,-12702,-30215,-12679,-30225,-12656,-30235,-12633,-30244,-12610,-30254,-12586,-30264,-12563,-30273,-12540,-30283,-12517,-30292,-12493,-30302,-12470,-30312,-12447,-30321,-12424,-30331,-12400,-30340,-12377,-30350,-12354,-30359,-12331,-30369,-12307,-30378,-12284,-30387,-12261,-30397,-12237,-30406,-12214,-30416,-12191,-30425,-12167,-30434,-12144,-30443,-12121,-30453,-12097,-30462,-12074,-30471,-12051,-30481,-12027,-30490,-12004,-30499,-11981,-30508,-11957,-30517,-11934,-30526,-11910,-30536,-11887,-30545,-11863,-30554,-11840,-30563,-11817,-30572,-11793,-30581,-11770,-30590,-11746,-30599,-11723,-30608,-11699,-30617,-11676,-30626,-11652,-30635,-11629,-30644,-11605,-30653,-11582,-30661,-11558,-30670,-11535,-30679,-11511,-30688,-11488,-30697,-11464,-30706,-11441,-30714,-11417,-30723,-11394,-30732,-11370,-30740,-11346,-30749,-11323,-30758,-11299,-30767,-11276,-30775,-11252,-30784,-11228,-30792,-11205,-30801,-11181,-30810,-11158,-30818,-11134,-30827,-11110,-30835,-11087,-30844,-11063,-30852,-11039,-30861,-11016,-30869,-10992,-30877,-10968,-30886,-10945,-30894,-10921,-30903,-10897,-30911,-10874,-30919,-10850,-30928,-10826,-30936,-10802,-30944,-10779,-30952,-10755,-30961,-10731,-30969,-10707,-30977,-10684,-30985,-10660,-30993,-10636,-31002,-10612,-31010,-10589,-31018,-105
65,-31026,-10541,-31034,-10517,-31042,-10493,-31050,-10470,-31058,-10446,-31066,-10422,-31074,-10398,-31082,-10374,-31090,-10350,-31098,-10327,-31106,-10303,-31114,-10279,-31122,-10255,-31129,-10231,-31137,-10207,-31145,-10183,-31153,-10160,-31161,-10136,-31168,-10112,-31176,-10088,-31184,-10064,-31192,-10040,-31199,-10016,-31207,-9992,-31215,-9968,-31222,-9944,-31230,-9920,-31237,-9896,-31245,-9872,-31253,-9848,-31260,-9824,-31268,-9800,-31275,-9776,-31283,-9752,-31290,-9728,-31298,-9704,-31305,-9680,-31312,-9656,-31320,-9632,-31327,-9608,-31335,-9584,-31342,-9560,-31349,-9536,-31357,-9512,-31364,-9488,-31371,-9464,-31378,-9440,-31386,-9416,-31393,-9392,-31400,-9368,-31407,-9344,-31414,-9320,-31421,-9296,-31429,-9271,-31436,-9247,-31443,-9223,-31450,-9199,-31457,-9175,-31464,-9151,-31471,-9127,-31478,-9103,-31485,-9078,-31492,-9054,-31499,-9030,-31506,-9006,-31513,-8982,-31519,-8958,-31526,-8933,-31533,-8909,-31540,-8885,-31547,-8861,-31554,-8837,-31560,-8813,-31567,-8788,-31574,-8764,-31581,-8740,-31587,-8716,-31594,-8691,-31601,-8667,-31607,-8643,-31614,-8619,-31620,-8594,-31627,-8570,-31634,-8546,-31640,-8522,-31647,-8497,-31653,-8473,-31660,-8449,-31666,-8425,-31673,-8400,-31679,-8376,-31685,-8352,-31692,-8327,-31698,-8303,-31705,-8279,-31711,-8254,-31717,-8230,-31724,-8206,-31730,-8181,-31736,-8157,-31742,-8133,-31749,-8108,-31755,-8084,-31761,-8060,-31767,-8035,-31773,-8011,-31779,-7987,-31786,-7962,-31792,-7938,-31798,-7913,-31804,-7889,-31810,-7865,-31816,-7840,-31822,-7816,-31828,-7791,-31834,-7767,-31840,-7743,-31846,-7718,-31852,-7694,-31857,-7669,-31863,-7645,-31869,-7620,-31875,-7596,-31881,-7572,-31887,-7547,-31892,-7523,-31898,-7498,-31904,-7474,-31910,-7449,-31915,-7425,-31921,-7400,-31927,-7376,-31932,-7351,-31938,-7327,-31944,-7302,-31949,-7278,-31955,-7253,-31960,-7229,-31966,-7204,-31971,-7180,-31977,-7155,-31982,-7131,-31988,-7106,-31993,-7082,-31999,-7057,-32004,-7033,-32009,-7008,-32015,-6983,-32020,-6959,-32025,-6934,-32031,-6910,-32036,-688
5,-32041,-6861,-32047,-6836,-32052,-6812,-32057,-6787,-32062,-6762,-32067,-6738,-32073,-6713,-32078,-6689,-32083,-6664,-32088,-6639,-32093,-6615,-32098,-6590,-32103,-6565,-32108,-6541,-32113,-6516,-32118,-6492,-32123,-6467,-32128,-6442,-32133,-6418,-32138,-6393,-32143,-6368,-32148,-6344,-32153,-6319,-32157,-6294,-32162,-6270,-32167,-6245,-32172,-6220,-32177,-6196,-32181,-6171,-32186,-6146,-32191,-6122,-32195,-6097,-32200,-6072,-32205,-6048,-32209,-6023,-32214,-5998,-32219,-5973,-32223,-5949,-32228,-5924,-32232,-5899,-32237,-5875,-32241,-5850,-32246,-5825,-32250,-5800,-32255,-5776,-32259,-5751,-32263,-5726,-32268,-5701,-32272,-5677,-32276,-5652,-32281,-5627,-32285,-5602,-32289,-5578,-32294,-5553,-32298,-5528,-32302,-5503,-32306,-5479,-32311,-5454,-32315,-5429,-32319,-5404,-32323,-5379,-32327,-5355,-32331,-5330,-32335,-5305,-32339,-5280,-32343,-5255,-32347,-5231,-32351,-5206,-32355,-5181,-32359,-5156,-32363,-5131,-32367,-5107,-32371,-5082,-32375,-5057,-32379,-5032,-32383,-5007,-32387,-4982,-32390,-4958,-32394,-4933,-32398,-4908,-32402,-4883,-32405,-4858,-32409,-4833,-32413,-4808,-32417,-4784,-32420,-4759,-32424,-4734,-32427,-4709,-32431,-4684,-32435,-4659,-32438,-4634,-32442,-4609,-32445,-4585,-32449,-4560,-32452,-4535,-32456,-4510,-32459,-4485,-32463,-4460,-32466,-4435,-32469,-4410,-32473,-4385,-32476,-4360,-32479,-4336,-32483,-4311,-32486,-4286,-32489,-4261,-32493,-4236,-32496,-4211,-32499,-4186,-32502,-4161,-32505,-4136,-32509,-4111,-32512,-4086,-32515,-4061,-32518,-4036,-32521,-4012,-32524,-3987,-32527,-3962,-32530,-3937,-32533,-3912,-32536,-3887,-32539,-3862,-32542,-3837,-32545,-3812,-32548,-3787,-32551,-3762,-32554,-3737,-32557,-3712,-32559,-3687,-32562,-3662,-32565,-3637,-32568,-3612,-32571,-3587,-32573,-3562,-32576,-3537,-32579,-3512,-32581,-3487,-32584,-3462,-32587,-3437,-32589,-3412,-32592,-3387,-32595,-3362,-32597,-3337,-32600,-3312,-32602,-3287,-32605,-3262,-32607,-3237,-32610,-3212,-32612,-3187,-32615,-3162,-32617,-3137,-32619,-3112,-32622,-3087,-32624,-3
062,-32626,-3037,-32629,-3012,-32631,-2987,-32633,-2962,-32636,-2937,-32638,-2912,-32640,-2887,-32642,-2862,-32645,-2837,-32647,-2812,-32649,-2787,-32651,-2762,-32653,-2737,-32655,-2712,-32657,-2687,-32659,-2662,-32661,-2637,-32663,-2611,-32665,-2586,-32667,-2561,-32669,-2536,-32671,-2511,-32673,-2486,-32675,-2461,-32677,-2436,-32679,-2411,-32681,-2386,-32682,-2361,-32684,-2336,-32686,-2311,-32688,-2286,-32689,-2261,-32691,-2236,-32693,-2210,-32695,-2185,-32696,-2160,-32698,-2135,-32700,-2110,-32701,-2085,-32703,-2060,-32704,-2035,-32706,-2010,-32707,-1985,-32709,-1960,-32710,-1935,-32712,-1909,-32713,-1884,-32715,-1859,-32716,-1834,-32718,-1809,-32719,-1784,-32720,-1759,-32722,-1734,-32723,-1709,-32724,-1684,-32726,-1659,-32727,-1633,-32728,-1608,-32729,-1583,-32730,-1558,-32732,-1533,-32733,-1508,-32734,-1483,-32735,-1458,-32736,-1433,-32737,-1407,-32738,-1382,-32739,-1357,-32740,-1332,-32741,-1307,-32742,-1282,-32743,-1257,-32744,-1232,-32745,-1207,-32746,-1181,-32747,-1156,-32748,-1131,-32749,-1106,-32750,-1081,-32751,-1056,-32751,-1031,-32752,-1006,-32753,-981,-32754,-955,-32754,-930,-32755,-905,-32756,-880,-32756,-855,-32757,-830,-32758,-805,-32758,-780,-32759,-754,-32759,-729,-32760,-704,-32760,-679,-32761,-654,-32761,-629,-32762,-604,-32762,-579,-32763,-553,-32763,-528,-32764,-503,-32764,-478,-32764,-453,-32765,-428,-32765,-403,-32765,-377,-32766,-352,-32766,-327,-32766,-302,-32766,-277,-32767,-252,-32767,-227,-32767,-202,-32767,-176,-32767,-151,-32767,-126,-32767,-101,-32767,-76,-32767,-51,-32767,-26};
...@@ -77,9 +77,9 @@ extern int exit_openair; ...@@ -77,9 +77,9 @@ extern int exit_openair;
//extern void do_OFDM_mod(mod_sym_t **txdataF, int32_t **txdata, uint32_t frame, uint16_t next_slot, LTE_DL_FRAME_PARMS *frame_parms); //extern void do_OFDM_mod(mod_sym_t **txdataF, int32_t **txdata, uint32_t frame, uint16_t next_slot, LTE_DL_FRAME_PARMS *frame_parms);
unsigned char dlsch_input_buffer[2700] __attribute__ ((aligned(16))); unsigned char dlsch_input_buffer[2700] __attribute__ ((aligned(32)));
int eNB_sync_buffer0[640*6] __attribute__ ((aligned(16))); int eNB_sync_buffer0[640*6] __attribute__ ((aligned(32)));
int eNB_sync_buffer1[640*6] __attribute__ ((aligned(16))); int eNB_sync_buffer1[640*6] __attribute__ ((aligned(32)));
int *eNB_sync_buffer[2] = {eNB_sync_buffer0, eNB_sync_buffer1}; int *eNB_sync_buffer[2] = {eNB_sync_buffer0, eNB_sync_buffer1};
extern uint16_t hundred_times_log10_NPRB[100]; extern uint16_t hundred_times_log10_NPRB[100];
......
...@@ -215,7 +215,7 @@ int main(int argc, char **argv) ...@@ -215,7 +215,7 @@ int main(int argc, char **argv)
// void *data; // void *data;
// int ii; // int ii;
// int bler; // int bler;
double blerr[4],uncoded_ber;//,avg_ber; double blerr[4],uncoded_ber,avg_ber;
short *uncoded_ber_bit=NULL; short *uncoded_ber_bit=NULL;
uint8_t N_RB_DL=25,osf=1; uint8_t N_RB_DL=25,osf=1;
frame_t frame_type = FDD; frame_t frame_type = FDD;
...@@ -3290,7 +3290,7 @@ PMI_FEEDBACK: ...@@ -3290,7 +3290,7 @@ PMI_FEEDBACK:
PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw]->current_harq_pid]->G = coded_bits_per_codeword; PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw]->current_harq_pid]->G = coded_bits_per_codeword;
/*
// calculate uncoded BLER // calculate uncoded BLER
uncoded_ber=0; uncoded_ber=0;
for (i=0;i<coded_bits_per_codeword;i++) for (i=0;i<coded_bits_per_codeword;i++)
...@@ -3306,7 +3306,7 @@ PMI_FEEDBACK: ...@@ -3306,7 +3306,7 @@ PMI_FEEDBACK:
if (n_frames==1) if (n_frames==1)
write_output("uncoded_ber_bit.m","uncoded_ber_bit",uncoded_ber_bit,coded_bits_per_codeword,1,0); write_output("uncoded_ber_bit.m","uncoded_ber_bit",uncoded_ber_bit,coded_bits_per_codeword,1,0);
*/
start_meas(&PHY_vars_UE->dlsch_unscrambling_stats); start_meas(&PHY_vars_UE->dlsch_unscrambling_stats);
dlsch_unscrambling(&PHY_vars_UE->lte_frame_parms, dlsch_unscrambling(&PHY_vars_UE->lte_frame_parms,
...@@ -3417,12 +3417,12 @@ PMI_FEEDBACK: ...@@ -3417,12 +3417,12 @@ PMI_FEEDBACK:
} }
sprintf(fname,"rxsig0_r%d.m",round); sprintf(fname,"rxsig0_r%d.m",round);
sprintf(vname,"rxs0_r%d.m",round); sprintf(vname,"rxs0_r%d",round);
write_output(fname,vname, &PHY_vars_UE->lte_ue_common_vars.rxdata[0][0],10*PHY_vars_UE->lte_frame_parms.samples_per_tti,1,1); write_output(fname,vname, &PHY_vars_UE->lte_ue_common_vars.rxdata[0][0],10*PHY_vars_UE->lte_frame_parms.samples_per_tti,1,1);
sprintf(fname,"rxsigF0_r%d.m",round); sprintf(fname,"rxsigF0_r%d.m",round);
sprintf(vname,"rxs0F_r%d.m",round); sprintf(vname,"rxs0F_r%d",round);
write_output(fname,vname, &PHY_vars_UE->lte_ue_common_vars.rxdataF[0][0],2*PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb,2,1); write_output(fname,vname, &PHY_vars_UE->lte_ue_common_vars.rxdataF[0][0],2*PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb,2,1);
if (PHY_vars_UE->lte_frame_parms.nb_antennas_rx>1) { if (PHY_vars_UE->lte_frame_parms.nb_antennas_rx>1) {
sprintf(fname,"rxsig1_r%d.m",round); sprintf(fname,"rxsig1_r%d.m",round);
sprintf(vname,"rxs1_r%d.m",round); sprintf(vname,"rxs1_r%d.m",round);
...@@ -3433,14 +3433,14 @@ PMI_FEEDBACK: ...@@ -3433,14 +3433,14 @@ PMI_FEEDBACK:
} }
sprintf(fname,"dlsch00_r%d.m",round); sprintf(fname,"dlsch00_r%d.m",round);
sprintf(vname,"dl00_r%d.m",round); sprintf(vname,"dl00_r%d",round);
write_output(fname,vname, write_output(fname,vname,
&(PHY_vars_UE->lte_ue_common_vars.dl_ch_estimates[eNB_id][0][0]), &(PHY_vars_UE->lte_ue_common_vars.dl_ch_estimates[eNB_id][0][0]),
PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb,1,1); PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb,1,1);
if (PHY_vars_UE->lte_frame_parms.nb_antennas_rx>1) { if (PHY_vars_UE->lte_frame_parms.nb_antennas_rx>1) {
sprintf(fname,"dlsch01_r%d.m",round); sprintf(fname,"dlsch01_r%d.m",round);
sprintf(vname,"dl01_r%d.m",round); sprintf(vname,"dl01_r%d",round);
write_output(fname,vname, write_output(fname,vname,
&(PHY_vars_UE->lte_ue_common_vars.dl_ch_estimates[eNB_id][1][0]), &(PHY_vars_UE->lte_ue_common_vars.dl_ch_estimates[eNB_id][1][0]),
PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb/2,1,1); PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb/2,1,1);
...@@ -3448,7 +3448,7 @@ PMI_FEEDBACK: ...@@ -3448,7 +3448,7 @@ PMI_FEEDBACK:
if (PHY_vars_eNB->lte_frame_parms.nb_antennas_tx>1) { if (PHY_vars_eNB->lte_frame_parms.nb_antennas_tx>1) {
sprintf(fname,"dlsch10_r%d.m",round); sprintf(fname,"dlsch10_r%d.m",round);
sprintf(vname,"dl10_r%d.m",round); sprintf(vname,"dl10_r%d",round);
write_output(fname,vname, write_output(fname,vname,
&(PHY_vars_UE->lte_ue_common_vars.dl_ch_estimates[eNB_id][2][0]), &(PHY_vars_UE->lte_ue_common_vars.dl_ch_estimates[eNB_id][2][0]),
PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb/2,1,1); PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb/2,1,1);
...@@ -3456,7 +3456,7 @@ PMI_FEEDBACK: ...@@ -3456,7 +3456,7 @@ PMI_FEEDBACK:
if ((PHY_vars_UE->lte_frame_parms.nb_antennas_rx>1) && (PHY_vars_eNB->lte_frame_parms.nb_antennas_tx>1)) { if ((PHY_vars_UE->lte_frame_parms.nb_antennas_rx>1) && (PHY_vars_eNB->lte_frame_parms.nb_antennas_tx>1)) {
sprintf(fname,"dlsch11_r%d.m",round); sprintf(fname,"dlsch11_r%d.m",round);
sprintf(vname,"dl11_r%d.m",round); sprintf(vname,"dl11_r%d",round);
write_output(fname,vname, write_output(fname,vname,
&(PHY_vars_UE->lte_ue_common_vars.dl_ch_estimates[eNB_id][3][0]), &(PHY_vars_UE->lte_ue_common_vars.dl_ch_estimates[eNB_id][3][0]),
PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb/2,1,1); PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb/2,1,1);
...@@ -3987,7 +3987,7 @@ PMI_FEEDBACK: ...@@ -3987,7 +3987,7 @@ PMI_FEEDBACK:
printf("[continue] effective rate : %f (%2.1f%%,%f)): increase snr \n",rate*effective_rate, 100*effective_rate, rate); printf("[continue] effective rate : %f (%2.1f%%,%f)): increase snr \n",rate*effective_rate, 100*effective_rate, rate);
} }
if (((double)errs[0]/(round_trials[0]))<1e-2) if (((double)errs[0]/(round_trials[0]))<(10.0/n_frames))
break; break;
}// SNR }// SNR
......
...@@ -942,7 +942,7 @@ void do_OFDM_mod_rt(int subframe,PHY_VARS_eNB *phy_vars_eNB) ...@@ -942,7 +942,7 @@ void do_OFDM_mod_rt(int subframe,PHY_VARS_eNB *phy_vars_eNB)
{ {
unsigned int aa,slot_offset, slot_offset_F; unsigned int aa,slot_offset, slot_offset_F;
int dummy_tx_b[7680*4] __attribute__((aligned(16))); int dummy_tx_b[7680*4] __attribute__((aligned(32)));
int i, tx_offset; int i, tx_offset;
int slot_sizeF = (phy_vars_eNB->lte_frame_parms.ofdm_symbol_size)* int slot_sizeF = (phy_vars_eNB->lte_frame_parms.ofdm_symbol_size)*
((phy_vars_eNB->lte_frame_parms.Ncp==1) ? 6 : 7); ((phy_vars_eNB->lte_frame_parms.Ncp==1) ? 6 : 7);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment