Commit 1e08469e authored by Raymond Knopp's avatar Raymond Knopp

conversion of some sse4 code to avx2 for turbo encoder. No noticeable...

conversion of some sse4 code to avx2 for turbo encoder.  No noticeable improvement (output expansion for encoder doesn't benefit much because of unpack instruction difference on AVX2.

maxh adjustement in dlsch_demodulation (to be checked further for other mcs)

exit criterion in dlsim to allow estimation of lower BLER.
parent 5dbe3bef
......@@ -134,7 +134,7 @@ else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2")
endif()
if (CPUINFO MATCHES "sse4_2")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2 -msse4.2")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.2")
endif()
if (CPUINFO MATCHES "sse4_1")
set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.1")
......
......@@ -37,6 +37,7 @@
#include "extern_3GPPinterleaver.h"
#else
#include "vars.h"
#include <stdint.h>
#endif
#include <stdio.h>
#include <string.h>
......@@ -48,6 +49,7 @@
#define print_shorts(s,x) printf("%s %x,%x,%x,%x,%x,%x,%x,%x\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7])
#define print_ints(s,x) printf("%s %x %x %x %x\n",s,(x)[0],(x)[1],(x)[2],(x)[3])
#define print_bytes2(s,x) printf("%s %x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7],(x)[8],(x)[9],(x)[10],(x)[11],(x)[12],(x)[13],(x)[14],(x)[15],(x)[16],(x)[17],(x)[18],(x)[19],(x)[20],(x)[21],(x)[22],(x)[23],(x)[24],(x)[25],(x)[26],(x)[27],(x)[28],(x)[29],(x)[30],(x)[31])
//#define DEBUG_TURBO_ENCODER 1
#define CALLGRIND 1
......@@ -143,9 +145,10 @@ void treillis_table_init(void)
char interleave_compact_byte(short * base_interleaver,unsigned char * input, unsigned char * output, int n)
{
char expandInput[768*8] __attribute__((aligned(16)));
char expandInput[768*8] __attribute__((aligned(32)));
int i,loop=n>>4;
#if defined(__x86_64__) || defined(__i386__)
#ifndef __AVX2__
__m128i *i_128=(__m128i *)input, *o_128=(__m128i*)expandInput;
__m128i tmp1, tmp2, tmp3, tmp4;
__m128i BIT_MASK = _mm_set_epi8( 0b00000001,
......@@ -164,6 +167,43 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
0b00100000,
0b01000000,
0b10000000);
#else
__m256i *i_256=(__m256i *)input, *o_256=(__m256i*)expandInput;
__m256i tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
__m256i BIT_MASK = _mm256_set_epi8( 0b00000001,
0b00000010,
0b00000100,
0b00001000,
0b00010000,
0b00100000,
0b01000000,
0b10000000,
0b00000001,
0b00000010,
0b00000100,
0b00001000,
0b00010000,
0b00100000,
0b01000000,
0b10000000,
0b00000001,
0b00000010,
0b00000100,
0b00001000,
0b00010000,
0b00100000,
0b01000000,
0b10000000,
0b00000001,
0b00000010,
0b00000100,
0b00001000,
0b00010000,
0b00100000,
0b01000000,
0b10000000);
#endif
#elif defined(__arm__)
uint8x16_t *i_128=(uint8x16_t *)input, *o_128=(uint8x16_t *)expandInput;
uint8x16_t tmp1,tmp2;
......@@ -187,46 +227,123 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
0b00000010,
0b00000001};
#endif
#ifndef __AVX2__
if ((n&15) > 0)
loop++;
#else
loop=n>>5;
if ((n&31) > 0)
loop++;
#endif
for (i=0; i<loop ; i++ ) {
/* int cur_byte=i<<3; */
/* for (b=0;b<8;b++) */
/* expandInput[cur_byte+b] = (input[i]&(1<<(7-b)))>>(7-b); */
#if defined(__x86_64__) || defined(__i386__)
tmp1=_mm_load_si128(i_128++);
tmp2=_mm_unpacklo_epi8(tmp1,tmp1);
tmp3=_mm_unpacklo_epi16(tmp2,tmp2);
tmp4=_mm_unpacklo_epi32(tmp3,tmp3);
#ifndef __AVX2__
tmp1=_mm_load_si128(i_128++); // tmp1 = B0,B1,...,B15
tmp2=_mm_unpacklo_epi8(tmp1,tmp1); // tmp2 = B0,B0,B1,B1,...,B7,B7
tmp3=_mm_unpacklo_epi16(tmp2,tmp2); // tmp3 = B0,B0,B0,B0,B1,B1,B1,B1,B2,B2,B2,B2,B3,B3,B3,B3
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 - B0,B0,B0,B0,B0,B0,B0,B0,B1,B1,B1,B1,B1,B1,B1,B1
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);
tmp4=_mm_unpackhi_epi32(tmp3,tmp3);
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 - B2,B2,B2,B2,B2,B2,B2,B2,B3,B3,B3,B3,B3,B3,B3,B3
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp3=_mm_unpackhi_epi16(tmp2,tmp2);
tmp4=_mm_unpacklo_epi32(tmp3,tmp3);
tmp3=_mm_unpackhi_epi16(tmp2,tmp2); // tmp3 = B4,B4,B4,B4,B5,B5,B5,B5,B6,B6,B6,B6,B7,B7,B7,B7
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 - B4,B4,B4,B4,B4,B4,B4,B4,B5,B5,B5,B5,B5,B5,B5,B5
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp4=_mm_unpackhi_epi32(tmp3,tmp3);
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 - B6,B6,B6,B6,B6,B6,B6,B6,B7,B7,B7,B7,B7,B7,B7,B7
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp2=_mm_unpackhi_epi8(tmp1,tmp1);
tmp3=_mm_unpacklo_epi16(tmp2,tmp2);
tmp4=_mm_unpacklo_epi32(tmp3,tmp3);
tmp2=_mm_unpackhi_epi8(tmp1,tmp1); // tmp2 = B8,B8,B9,B9,...,B15,B15
tmp3=_mm_unpacklo_epi16(tmp2,tmp2); // tmp3 = B8,B8,B8,B8,B9,B9,B9,B9,B10,B10,B10,B10,B11,B11,B11,B11
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 = B8,B8,B8,B8,B8,B8,B8,B8,B9,B9,B9,B9,B9,B9,B9,B9
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp4=_mm_unpackhi_epi32(tmp3,tmp3);
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 = B10,B10,B10,B10,B10,B10,B10,B10,B11,B11,B11,B11,B11,B11,B11,B11
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp3=_mm_unpackhi_epi16(tmp2,tmp2);
tmp4=_mm_unpacklo_epi32(tmp3,tmp3);
tmp3=_mm_unpackhi_epi16(tmp2,tmp2); // tmp3 = B12,B12,B12,B12,B13,B13,B13,B13,B14,B14,B14,B14,B15,B15,B15,B15
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 = B12,B12,B12,B12,B12,B12,B12,B12,B13,B13,B13,B13,B13,B13,B13,B13
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp4=_mm_unpackhi_epi32(tmp3,tmp3);
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 = B14,B14,B14,B14,B14,B14,B14,B14,B15,B15,B15,B15,B15,B15,B15,B15
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
#else
tmp1=_mm256_load_si256(i_256++); // tmp1 = B0,B1,...,B15,...,B31
//print_bytes2("in",(uint8_t*)&tmp1);
tmp2=_mm256_unpacklo_epi8(tmp1,tmp1); // tmp2 = B0,B0,B1,B1,...,B7,B7,B16,B16,B17,B17,...,B23,B23
tmp3=_mm256_unpacklo_epi16(tmp2,tmp2); // tmp3 = B0,B0,B0,B0,B1,B1,B1,B1,B2,B2,B2,B2,B3,B3,B3,B3,B16,B16,B16,B16,...,B19,B19,B19,B19
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 - B0,B0,B0,B0,B0,B0,B0,B0,B1,B1,B1,B1,B1,B1,B1,B1,B16,B16...,B17..,B17
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 - B2,B2,B2,B2,B2,B2,B2,B2,B3,B3,B3,B3,B3,B3,B3,B3,B18...,B18,B19,...,B19
tmp6=_mm256_insertf128_si256(tmp4,_mm256_extracti128_si256(tmp5,0),1); // tmp6 = B0 B1 B2 B3
tmp7=_mm256_insertf128_si256(tmp5,_mm256_extracti128_si256(tmp4,1),0); // tmp7 = B16 B17 B18 B19
//print_bytes2("tmp2",(uint8_t*)&tmp2);
//print_bytes2("tmp3",(uint8_t*)&tmp3);
//print_bytes2("tmp4",(uint8_t*)&tmp4);
//print_bytes2("tmp5",(uint8_t*)&tmp4);
//print_bytes2("tmp6",(uint8_t*)&tmp6);
//print_bytes2("tmp7",(uint8_t*)&tmp7);
o_256[0]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp6,BIT_MASK),BIT_MASK);
//print_bytes2("out",(uint8_t*)o_256);
o_256[4]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+4));
tmp3=_mm256_unpackhi_epi16(tmp2,tmp2); // tmp3 = B4,B4,B4,B4,B5,B5,B5,B5,B6,B6,B6,B6,B7,B7,B7,B7,B20,B20,B20,B20,...,B23,B23,B23,B23
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 - B4,B4,B4,B4,B4,B4,B4,B4,B5,B5,B5,B5,B5,B5,B5,B5,B20,B20...,B21..,B21
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 - B6,B6,B6,B6,B6,B6,B6,B6,B7,B7,B7,B7,B7,B7,B7,B7,B22...,B22,B23,...,B23
tmp6=_mm256_insertf128_si256(tmp4,_mm256_extracti128_si256(tmp5,0),1); // tmp6 = B4 B5 B6 B7
tmp7=_mm256_insertf128_si256(tmp5,_mm256_extracti128_si256(tmp4,1),0); // tmp7 = B20 B21 B22 B23
//print_bytes2("tmp2",(uint8_t*)&tmp2);
//print_bytes2("tmp3",(uint8_t*)&tmp3);
//print_bytes2("tmp4",(uint8_t*)&tmp4);
//print_bytes2("tmp5",(uint8_t*)&tmp4);
//print_bytes2("tmp6",(uint8_t*)&tmp6);
//print_bytes2("tmp7",(uint8_t*)&tmp7);
o_256[1]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp6,BIT_MASK),BIT_MASK);
//print_bytes2("out",(uint8_t*)(o_256+1));
o_256[5]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+4));
tmp2=_mm256_unpackhi_epi8(tmp1,tmp1); // tmp2 = B8 B9 B10 B11 B12 B13 B14 B15 B25 B26 B27 B28 B29 B30 B31
tmp3=_mm256_unpacklo_epi16(tmp2,tmp2); // tmp3 = B8,B9,B10,B11,B26,B27,B28,B29
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 - B8,B9,B26,B27
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 - B10,B11,B28,B29
tmp6=_mm256_insertf128_si256(tmp4,_mm256_extracti128_si256(tmp5,0),1); // tmp6 = B8 B9 B10 B11
tmp7=_mm256_insertf128_si256(tmp5,_mm256_extracti128_si256(tmp4,1),0); // tmp7 = B26 B27 B28 B29
//print_bytes2("tmp2",(uint8_t*)&tmp2);
//print_bytes2("tmp3",(uint8_t*)&tmp3);
//print_bytes2("tmp4",(uint8_t*)&tmp4);
//print_bytes2("tmp5",(uint8_t*)&tmp4);
//print_bytes2("tmp6",(uint8_t*)&tmp6);
//print_bytes2("tmp7",(uint8_t*)&tmp7);
o_256[2]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp6,BIT_MASK),BIT_MASK);
//print_bytes2("out",(uint8_t*)(o_256+2));
o_256[6]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+4));
tmp3=_mm256_unpackhi_epi16(tmp2,tmp2); // tmp3 = B12 B13 B14 B15 B28 B29 B30 B31
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 = B12 B13 B28 B29
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 = B14 B15 B30 B31
tmp6=_mm256_insertf128_si256(tmp4,_mm256_extracti128_si256(tmp5,0),1); // tmp6 = B12 B13 B14 B15
tmp7=_mm256_insertf128_si256(tmp5,_mm256_extracti128_si256(tmp4,1),0); // tmp7 = B28 B29 B30 B31
//print_bytes2("tmp2",(uint8_t*)&tmp2);
//print_bytes2("tmp3",(uint8_t*)&tmp3);
//print_bytes2("tmp4",(uint8_t*)&tmp4);
//print_bytes2("tmp5",(uint8_t*)&tmp4);
//print_bytes2("tmp6",(uint8_t*)&tmp6);
//print_bytes2("tmp7",(uint8_t*)&tmp7);
o_256[3]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp6,BIT_MASK),BIT_MASK);
//print_bytes2("out",(uint8_t*)(o_256+3));
o_256[7]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+7));
o_256+=8;
#endif
#elif defined(__arm__)
tmp1=vld1q_u8((uint8_t*)i_128);
//print_bytes("tmp1:",(uint8_t*)&tmp1);
......@@ -305,8 +422,13 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
short * ptr_intl=base_interleaver;
#if defined(__x86_64) || defined(__i386__)
#ifndef __AVX2__
__m128i tmp;
uint16_t *systematic2_ptr=(unsigned short *) output;
uint16_t *systematic2_ptr=(uint16_t *) output;
#else
__m256i tmp;
uint32_t *systematic2_ptr=(uint32_t *) output;
#endif
#elif defined(__arm__)
uint8x16_t tmp;
const uint8_t __attribute__ ((aligned (16))) _Powers[16]=
......@@ -316,11 +438,15 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
uint8x16_t Powers= vld1q_u8(_Powers);
uint8_t *systematic2_ptr=(uint8_t *) output;
#endif
#ifndef __AVX2
int input_length_words=n>>1;
#else
int input_length_words=n>>2;
#endif
for ( i=0; i< input_length_words ; i ++ ) {
#if defined(__x86_64__) || defined(__i386__)
#ifndef __AVX2__
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],7);
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],6);
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],5);
......@@ -338,6 +464,41 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+1);
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],8+0);
*systematic2_ptr++=(unsigned short)_mm_movemask_epi8(tmp);
#else
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],5);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],4);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],3);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],0);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+5);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+4);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+3);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+0);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+5);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+4);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+3);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+0);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+5);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+4);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+3);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+0);
*systematic2_ptr++=(unsigned int)_mm256_movemask_epi8(tmp);
#endif
#elif defined(__arm__)
tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,7);
tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,6);
......@@ -485,7 +646,7 @@ void threegpplte_turbo_encoder(unsigned char *input,
int main(int argc,char **argv)
{
unsigned char input[INPUT_LENGTH+16],state,state2;
unsigned char input[INPUT_LENGTH+32],state,state2;
unsigned char output[12+(3*(INPUT_LENGTH<<3))],x,z;
int i;
unsigned char out;
......@@ -510,7 +671,7 @@ int main(int argc,char **argv)
memset((void*)input,0,INPUT_LENGTH+16);
for (i=0; i<INPUT_LENGTH; i++) {
input[i] = i*219;
printf("Input %d : %x\n",i,input[i]);
printf("Input %d : %d\n",i,input[i]);
}
threegpplte_turbo_encoder(&input[0],
......
......@@ -281,7 +281,7 @@ int rx_pdsch(PHY_VARS_UE *phy_vars_ue,
// avgs = cmax(avgs,avg[(aarx<<1)+aatx]);
lte_ue_pdsch_vars[eNB_id]->log2_maxh = (log2_approx(avgs)/2);
lte_ue_pdsch_vars[eNB_id]->log2_maxh = (log2_approx(avgs)/2)+3;
// + log2_approx(frame_parms->nb_antennas_tx_eNB-1) //-1 because log2_approx counts the number of bits
// + log2_approx(frame_parms->nb_antennas_rx-1);
......
......@@ -4033,7 +4033,7 @@ PMI_FEEDBACK:
printf("[continue] effective rate : %f (%2.1f%%,%f)): increase snr \n",rate*effective_rate, 100*effective_rate, rate);
}
if (((double)errs[0]/(round_trials[0]))<1e-2)
if (((double)errs[0]/(round_trials[0]))<(10.0/n_frames))
break;
}// SNR
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment