Commit 1cb484f1 authored by frtabu's avatar frtabu

fix more trivial cppcheck errors and warnings

parent d171e18c
......@@ -164,9 +164,10 @@ int processoption(paramdef_t *cfgoptions, char *value) {
*/
int config_check_unknown_cmdlineopt(char *prefix) {
int unknowndetected=0;
char testprefix[CONFIG_MAXOPTLENGTH]="";
char testprefix[CONFIG_MAXOPTLENGTH];
int finalcheck = 0;
memset(testpref,0,sizeof(testprefix));
if (prefix != NULL) {
if (strcmp(prefix,CONFIG_CHECKALLSECTIONS) == 0)
finalcheck = 1;
......
......@@ -270,6 +270,7 @@ int pnf_param_resp_cb(nfapi_vnf_config_t* config, int p5_idx, nfapi_pnf_param_re
for(int i = 0; i < resp->pnf_phy.number_of_phys; ++i)
{
phy_info phy;
memset(phy,0,sizeof(phy));
phy.index = resp->pnf_phy.phy[i].phy_config_index;
printf("[VNF] (PHY:%d) phy_config_idx:%d\n", i, resp->pnf_phy.phy[i].phy_config_index);
......@@ -287,6 +288,7 @@ int pnf_param_resp_cb(nfapi_vnf_config_t* config, int p5_idx, nfapi_pnf_param_re
for(int i = 0; i < resp->pnf_rf.number_of_rfs; ++i) {
rf_info rf;
memset(rf,0,sizeof(rf));
rf.index = resp->pnf_rf.rf[i].rf_config_index;
printf("[VNF] (RF:%d) rf_config_idx:%d\n", i, resp->pnf_rf.rf[i].rf_config_index);
......@@ -897,7 +899,7 @@ int param_resp_cb(nfapi_vnf_config_t* config, int p5_idx, nfapi_param_response_t
// for now just 1
printf("[VNF] %d.%d pnf p7 %s:%d timing %d %d %d %d\n", p5_idx, phy->id, phy->remote_addr, phy->remote_port, p7_vnf->timing_window, p7_vnf->periodic_timing_period, p7_vnf->aperiodic_timing_enabled, p7_vnf->periodic_timing_period);
printf("[VNF] %d.%d pnf p7 %s:%d timing %u %u %u %u\n", p5_idx, phy->id, phy->remote_addr, phy->remote_port, p7_vnf->timing_window, p7_vnf->periodic_timing_period, p7_vnf->aperiodic_timing_enabled, p7_vnf->periodic_timing_period);
req->header.message_id = NFAPI_CONFIG_REQUEST;
req->header.phy_id = phy->id;
......@@ -919,7 +921,7 @@ int param_resp_cb(nfapi_vnf_config_t* config, int p5_idx, nfapi_param_response_t
req->nfapi_config.timing_window.tl.tag = NFAPI_NFAPI_TIMING_WINDOW_TAG;
req->nfapi_config.timing_window.value = p7_vnf->timing_window;
printf("[VNF] Timing window:%d\n", p7_vnf->timing_window);
printf("[VNF] Timing window:%u\n", p7_vnf->timing_window);
req->num_tlv++;
if(p7_vnf->periodic_timing_enabled || p7_vnf->aperiodic_timing_enabled) {
......
......@@ -26,9 +26,9 @@
date: 09.2012
*/
#ifndef TC_MAIN
#include "coding_defs.h"
#include "coding_defs.h"
#else
#include <stdint.h>
#include <stdint.h>
#endif
#include <stdio.h>
#include <string.h>
......@@ -66,11 +66,11 @@ struct treillis {
union {
uint8x8_t systematic_andp1_64[3];
char systematic_andp1_8[24];
}__attribute__((aligned(64)));
} __attribute__((aligned(64)));
union {
uint8x8_t parity2_64[3];
char parity2_8[24];
}__attribute__((aligned(64)));
} __attribute__((aligned(64)));
int exit_state;
};
#endif
......@@ -79,23 +79,20 @@ struct treillis all_treillis[8][256];
int all_treillis_initialized=0;
static inline unsigned char threegpplte_rsc(unsigned char input,unsigned char *state)
{
static inline unsigned char threegpplte_rsc(unsigned char input,unsigned char *state) {
unsigned char output;
output = (input ^ (*state>>2) ^ (*state>>1))&1;
*state = (((input<<2)^(*state>>1))^((*state>>1)<<2)^((*state)<<2))&7;
return(output);
}
static inline void threegpplte_rsc_termination(unsigned char *x,unsigned char *z,unsigned char *state)
{
static inline void threegpplte_rsc_termination(unsigned char *x,unsigned char *z,unsigned char *state) {
*z = ((*state>>2) ^ (*state)) &1;
*x = ((*state) ^ (*state>>1)) &1;
*state = (*state)>>1;
}
static void treillis_table_init(void)
{
static void treillis_table_init(void) {
//struct treillis t[][]=all_treillis;
//t=memalign(16,sizeof(struct treillis)*8*256);
int i, j,b;
......@@ -128,14 +125,12 @@ static void treillis_table_init(void)
}
char interleave_compact_byte(short * base_interleaver,unsigned char * input, unsigned char * output, int n)
{
char interleave_compact_byte(short *base_interleaver,unsigned char *input, unsigned char *output, int n) {
char expandInput[768*8] __attribute__((aligned(32)));
int i,loop=n>>4;
#if defined(__x86_64__) || defined(__i386__)
#ifndef __AVX2__
__m128i *i_128=(__m128i *)input, *o_128=(__m128i*)expandInput;
__m128i *i_128=(__m128i *)input, *o_128=(__m128i *)expandInput;
__m128i tmp1, tmp2, tmp3, tmp4;
__m128i BIT_MASK = _mm_set_epi8( 0b00000001,
0b00000010,
......@@ -153,9 +148,8 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
0b00100000,
0b01000000,
0b10000000);
#else
__m256i *i_256=(__m256i *)input, *o_256=(__m256i*)expandInput;
__m256i *i_256=(__m256i *)input, *o_256=(__m256i *)expandInput;
__m256i tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
__m256i BIT_MASK = _mm256_set_epi8( 0b00000001,
0b00000010,
......@@ -211,25 +205,26 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
0b00001000,
0b00000100,
0b00000010,
0b00000001};
0b00000001
};
#endif
#ifndef __AVX2__
if ((n&15) > 0)
loop++;
#else
loop=n>>5;
if ((n&31) > 0)
loop++;
#endif
#endif
for (i=0; i<loop ; i++ ) {
// int cur_byte=i<<3;
// for (b=0;b<8;b++)
// expandInput[cur_byte+b] = (input[i]&(1<<(7-b)))>>(7-b);
#if defined(__x86_64__) || defined(__i386__)
#ifndef __AVX2__
tmp1=_mm_load_si128(i_128++); // tmp1 = B0,B1,...,B15
......@@ -237,29 +232,22 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
tmp3=_mm_unpacklo_epi16(tmp2,tmp2); // tmp3 = B0,B0,B0,B0,B1,B1,B1,B1,B2,B2,B2,B2,B3,B3,B3,B3
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 - B0,B0,B0,B0,B0,B0,B0,B0,B1,B1,B1,B1,B1,B1,B1,B1
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 - B2,B2,B2,B2,B2,B2,B2,B2,B3,B3,B3,B3,B3,B3,B3,B3
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp3=_mm_unpackhi_epi16(tmp2,tmp2); // tmp3 = B4,B4,B4,B4,B5,B5,B5,B5,B6,B6,B6,B6,B7,B7,B7,B7
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 - B4,B4,B4,B4,B4,B4,B4,B4,B5,B5,B5,B5,B5,B5,B5,B5
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 - B6,B6,B6,B6,B6,B6,B6,B6,B7,B7,B7,B7,B7,B7,B7,B7
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp2=_mm_unpackhi_epi8(tmp1,tmp1); // tmp2 = B8,B8,B9,B9,...,B15,B15
tmp3=_mm_unpacklo_epi16(tmp2,tmp2); // tmp3 = B8,B8,B8,B8,B9,B9,B9,B9,B10,B10,B10,B10,B11,B11,B11,B11
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 = B8,B8,B8,B8,B8,B8,B8,B8,B9,B9,B9,B9,B9,B9,B9,B9
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 = B10,B10,B10,B10,B10,B10,B10,B10,B11,B11,B11,B11,B11,B11,B11,B11
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp3=_mm_unpackhi_epi16(tmp2,tmp2); // tmp3 = B12,B12,B12,B12,B13,B13,B13,B13,B14,B14,B14,B14,B15,B15,B15,B15
tmp4=_mm_unpacklo_epi32(tmp3,tmp3); // tmp4 = B12,B12,B12,B12,B12,B12,B12,B12,B13,B13,B13,B13,B13,B13,B13,B13
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
tmp4=_mm_unpackhi_epi32(tmp3,tmp3); // tmp4 = B14,B14,B14,B14,B14,B14,B14,B14,B15,B15,B15,B15,B15,B15,B15,B15
*o_128++=_mm_cmpeq_epi8(_mm_and_si128(tmp4,BIT_MASK),BIT_MASK);;
#else
......@@ -281,7 +269,6 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
//print_bytes2("out",(uint8_t*)o_256);
o_256[4]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+4));
tmp3=_mm256_unpackhi_epi16(tmp2,tmp2); // tmp3 = B4,B4,B4,B4,B5,B5,B5,B5,B6,B6,B6,B6,B7,B7,B7,B7,B20,B20,B20,B20,...,B23,B23,B23,B23
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 - B4,B4,B4,B4,B4,B4,B4,B4,B5,B5,B5,B5,B5,B5,B5,B5,B20,B20...,B21..,B21
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 - B6,B6,B6,B6,B6,B6,B6,B6,B7,B7,B7,B7,B7,B7,B7,B7,B22...,B22,B23,...,B23
......@@ -297,7 +284,6 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
//print_bytes2("out",(uint8_t*)(o_256+1));
o_256[5]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+4));
tmp2=_mm256_unpackhi_epi8(tmp1,tmp1); // tmp2 = B8 B9 B10 B11 B12 B13 B14 B15 B25 B26 B27 B28 B29 B30 B31
tmp3=_mm256_unpacklo_epi16(tmp2,tmp2); // tmp3 = B8,B9,B10,B11,B26,B27,B28,B29
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 - B8,B9,B26,B27
......@@ -314,7 +300,6 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
//print_bytes2("out",(uint8_t*)(o_256+2));
o_256[6]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+4));
tmp3=_mm256_unpackhi_epi16(tmp2,tmp2); // tmp3 = B12 B13 B14 B15 B28 B29 B30 B31
tmp4=_mm256_unpacklo_epi32(tmp3,tmp3); // tmp4 = B12 B13 B28 B29
tmp5=_mm256_unpackhi_epi32(tmp3,tmp3); // tmp5 = B14 B15 B30 B31
......@@ -330,48 +315,35 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
//print_bytes2("out",(uint8_t*)(o_256+3));
o_256[7]=_mm256_cmpeq_epi8(_mm256_and_si256(tmp7,BIT_MASK),BIT_MASK);;
//print_bytes2("out",(uint8_t*)(o_256+7));
o_256+=8;
#endif
#elif defined(__arm__)
tmp1=vld1q_u8((uint8_t*)i_128);
tmp1=vld1q_u8((uint8_t *)i_128);
//print_bytes("tmp1:",(uint8_t*)&tmp1);
uint8x16x2_t temp1 = vzipq_u8(tmp1,tmp1);
tmp2 = temp1.val[0];
uint16x8x2_t temp2 = vzipq_u16((uint16x8_t)tmp2,(uint16x8_t)tmp2);
tmp3 = temp2.val[0];
uint32x4x2_t temp3 = vzipq_u32((uint32x4_t)tmp3,(uint32x4_t)tmp3);
tmp4 = temp3.val[0];
//print_bytes("tmp4:",(uint8_t*)&tmp4);
*o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //1
//print_bytes("o:",(uint8_t*)(o_128-1));
tmp4 = temp3.val[1];
//print_bytes("tmp4:",(uint8_t*)&tmp4);
*o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //2
//print_bytes("o:",(uint8_t*)(o_128-1));
tmp3 = temp2.val[1];
temp3 = vzipq_u32((uint32x4_t)tmp3,(uint32x4_t)tmp3);
tmp4 = temp3.val[0];
//print_bytes("tmp4:",(uint8_t*)&tmp4);
*o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //3
//print_bytes("o:",(uint8_t*)(o_128-1));
tmp4 = temp3.val[1];
//print_bytes("tmp4:",(uint8_t*)&tmp4);
*o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //4
//and_tmp = vandq_u8((uint8x16_t)tmp4,BIT_MASK); print_bytes("and:",and_tmp);
//print_bytes("o:",(uint8_t*)(o_128-1));
temp1 = vzipq_u8(tmp1,tmp1);
tmp2 = temp1.val[1];
temp2 = vzipq_u16((uint16x8_t)tmp2,(uint16x8_t)tmp2);
......@@ -379,38 +351,28 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
temp3 = vzipq_u32((uint32x4_t)tmp3,(uint32x4_t)tmp3);
tmp4 = temp3.val[0];
//print_bytes("tmp4:",(uint8_t*)&tmp4);
*o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //5
//print_bytes("o:",(uint8_t*)(o_128-1));
tmp4 = temp3.val[1];
//print_bytes("tmp4:",(uint8_t*)&tmp4);
*o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //6
//print_bytes("o:",(uint8_t*)(o_128-1));
temp2 = vzipq_u16((uint16x8_t)tmp2,(uint16x8_t)tmp2);
tmp3 = temp2.val[1];
temp3 = vzipq_u32((uint32x4_t)tmp3,(uint32x4_t)tmp3);
tmp4 = temp3.val[0];
//print_bytes("tmp4:",(uint8_t*)&tmp4);
*o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //7
//print_bytes("o:",(uint8_t*)(o_128-1));
tmp4 = temp3.val[1];
//print_bytes("tmp4:",(uint8_t*)&tmp4);
*o_128++=vceqq_u8(vandq_u8((uint8x16_t)tmp4,BIT_MASK),BIT_MASK); //7
//print_bytes("o:",(uint8_t*)(o_128-1));
i_128++;
#endif
}
short * ptr_intl=base_interleaver;
short *ptr_intl=base_interleaver;
#if defined(__x86_64) || defined(__i386__)
#ifndef __AVX2__
__m128i tmp;
......@@ -423,8 +385,7 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
uint8x16_t tmp;
const uint8_t __attribute__ ((aligned (16))) _Powers[16]=
{ 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
// Set the powers of 2 (do it once for all, if applicable)
// Set the powers of 2 (do it once for all, if applicable)
uint8x16_t Powers= vld1q_u8(_Powers);
uint8_t *systematic2_ptr=(uint8_t *) output;
#endif
......@@ -435,8 +396,6 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
#endif
for ( i=0; i< input_length_words ; i ++ ) {
#if defined(__x86_64__) || defined(__i386__)
#ifndef __AVX2__
tmp=_mm_insert_epi8(tmp,expandInput[*ptr_intl++],7);
......@@ -465,7 +424,6 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],0);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+5);
......@@ -474,7 +432,6 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],8+0);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+5);
......@@ -483,7 +440,6 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],16+0);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+7);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+6);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+5);
......@@ -492,7 +448,6 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+2);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+1);
tmp=_mm256_insert_epi8(tmp,expandInput[*ptr_intl++],24+0);
*systematic2_ptr++=(unsigned int)_mm256_movemask_epi8(tmp);
#endif
#elif defined(__arm__)
......@@ -512,11 +467,10 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+2);
tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+1);
tmp=vsetq_lane_u8(expandInput[*ptr_intl++],tmp,8+0);
// Compute the mask from the input
// Compute the mask from the input
uint64x2_t Mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(tmp, Powers))));
vst1q_lane_u8(systematic2_ptr++, (uint8x16_t)Mask, 0);
vst1q_lane_u8(systematic2_ptr++, (uint8x16_t)Mask, 8);
#endif
}
......@@ -537,14 +491,12 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
void threegpplte_turbo_encoder_sse(unsigned char *input,
unsigned short input_length_bytes,
unsigned char *output,
unsigned char F)
{
unsigned char F) {
int i;
unsigned char *x;
unsigned char state0=0,state1=0;
unsigned short input_length_bits = input_length_bytes<<3;
short * base_interleaver;
short *base_interleaver;
if ( all_treillis_initialized == 0 ) {
treillis_table_init();
......@@ -560,15 +512,12 @@ void threegpplte_turbo_encoder_sse(unsigned char *input,
base_interleaver=il_tb+f1f2mat[i].beg_index;
}
unsigned char systematic2[768] __attribute__((aligned(32)));
interleave_compact_byte(base_interleaver,input,systematic2,input_length_bytes);
#if defined(__x86_64__) || defined(__i386__)
__m64 *ptr_output=(__m64*) output;
__m64 *ptr_output=(__m64 *) output;
#elif defined(__arm__)
uint8x8_t *ptr_output=(uint8x8_t*)output;
uint8x8_t *ptr_output=(uint8x8_t *)output;
#endif
unsigned char cur_s1, cur_s2;
int code_rate;
......@@ -584,11 +533,8 @@ void threegpplte_turbo_encoder_sse(unsigned char *input,
_mm_add_pi8(all_treillis[state0][cur_s1].parity1_64[code_rate],
all_treillis[state1][cur_s2].parity2_64[code_rate]));
*/
*ptr_output++ = _mm_add_pi8(all_treillis[state0][cur_s1].systematic_andp1_64[code_rate],
all_treillis[state1][cur_s2].parity2_64[code_rate]);
#elif defined(__arm__)
*ptr_output++ = vadd_u8(all_treillis[state0][cur_s1].systematic_andp1_64[code_rate],
all_treillis[state0][cur_s1].parity2_64[code_rate]);
......@@ -600,36 +546,30 @@ void threegpplte_turbo_encoder_sse(unsigned char *input,
}
x=output+(input_length_bits*3);
// Trellis termination
threegpplte_rsc_termination(&x[0],&x[1],&state0);
#ifdef DEBUG_TURBO_ENCODER
printf("term: x0 %d, x1 %d, state0 %d\n",x[0],x[1],state0);
printf("term: x0 %u, x1 %u, state0 %d\n",x[0],x[1],state0);
#endif //DEBUG_TURBO_ENCODER
threegpplte_rsc_termination(&x[2],&x[3],&state0);
#ifdef DEBUG_TURBO_ENCODER
printf("term: x0 %d, x1 %d, state0 %d\n",x[2],x[3],state0);
printf("term: x0 %u, x1 %u, state0 %d\n",x[2],x[3],state0);
#endif //DEBUG_TURBO_ENCODER
threegpplte_rsc_termination(&x[4],&x[5],&state0);
#ifdef DEBUG_TURBO_ENCODER
printf("term: x0 %d, x1 %d, state0 %d\n",x[4],x[5],state0);
printf("term: x0 %u, x1 %u, state0 %d\n",x[4],x[5],state0);
#endif //DEBUG_TURBO_ENCODER
threegpplte_rsc_termination(&x[6],&x[7],&state1);
#ifdef DEBUG_TURBO_ENCODER
printf("term: x0 %d, x1 %d, state1 %d\n",x[6],x[7],state1);
printf("term: x0 %u, x1 %u, state1 %d\n",x[6],x[7],state1);
#endif //DEBUG_TURBO_ENCODER
threegpplte_rsc_termination(&x[8],&x[9],&state1);
#ifdef DEBUG_TURBO_ENCODER
printf("term: x0 %d, x1 %d, state1 %d\n",x[8],x[9],state1);
printf("term: x0 %u, x1 %u, state1 %d\n",x[8],x[9],state1);
#endif //DEBUG_TURBO_ENCODER
threegpplte_rsc_termination(&x[10],&x[11],&state1);
#ifdef DEBUG_TURBO_ENCODER
printf("term: x0 %d, x1 %d, state1 %d\n",x[10],x[11],state1);
printf("term: x0 %u, x1 %u, state1 %d\n",x[10],x[11],state1);
#endif //DEBUG_TURBO_ENCODER
#if defined(__x86_64__) || defined(__i386__)
_mm_empty();
......@@ -643,16 +583,17 @@ void init_encoder_sse (void) {
/* function which will be called by the shared lib loader, to check shared lib version
against main exec version. version mismatch no considered as fatal (interfaces not supposed to change)
*/
int coding_checkbuildver(char * mainexec_buildversion, char ** shlib_buildversion)
{
int coding_checkbuildver(char *mainexec_buildversion, char **shlib_buildversion) {
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION "standalone built: " __DATE__ __TIME__
#endif
*shlib_buildversion = PACKAGE_VERSION;
if (strcmp(mainexec_buildversion, *shlib_buildversion) != 0) {
fprintf(stderr,"[CODING] shared lib version %s, doesn't match main version %s, compatibility should be checked\n",
mainexec_buildversion,*shlib_buildversion);
}
return 0;
}
......@@ -661,9 +602,7 @@ int coding_checkbuildver(char * mainexec_buildversion, char ** shlib_buildversi
#define F1 21
#define F2 120
int main(int argc,char **argv)
{
int main(int argc,char **argv) {
unsigned char input[INPUT_LENGTH+32],state,state2;
unsigned char output[12+(3*(INPUT_LENGTH<<3))],x,z;
int i;
......@@ -680,16 +619,16 @@ int main(int argc,char **argv)
printf("\n");
for (state=0; state<8; state++) {
state2=state;
threegpplte_rsc_termination(&x,&z,&state2);
printf("Termination: (%d->%d) : (%d,%d)\n",state,state2,x,z);
}
memset((void*)input,0,INPUT_LENGTH+16);
memset((void *)input,0,INPUT_LENGTH+16);
for (i=0; i<INPUT_LENGTH; i++) {
input[i] = i*219;
printf("Input %d : %d\n",i,input[i]);
printf("Input %d : %u\n",i,input[i]);
}
threegpplte_turbo_encoder_sse(&input[0],
......@@ -697,11 +636,10 @@ int main(int argc,char **argv)
&output[0],
0);
for (i=0; i<12+(INPUT_LENGTH*24); i++)
printf("%u",output[i]);
for (i=0;i<12+(INPUT_LENGTH*24);i++)
printf("%d",output[i]);
printf("\n");
return(0);
}
......
......@@ -38,15 +38,15 @@
#include "PHY/sse_intrin.h"
#ifndef TEST_DEBUG
#include "PHY/defs.h"
#include "PHY/CODING/defs.h"
#include "PHY/CODING/lte_interleaver_inline.h"
#include "PHY/defs.h"
#include "PHY/CODING/defs.h"
#include "PHY/CODING/lte_interleaver_inline.h"
#else
#include "defs.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "defs.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#endif
#define SHUFFLE16(a,b,c,d,e,f,g,h) _mm_set_epi8(h==-1?-1:h*2+1, \
......@@ -75,44 +75,40 @@
#ifdef LLR8
typedef int8_t llr_t; // internal decoder LLR data is 8-bit fixed
typedef int8_t channel_t;
#define MAX 64
typedef int8_t llr_t; // internal decoder LLR data is 8-bit fixed
typedef int8_t channel_t;
#define MAX 64
#else
typedef int16_t llr_t; // internal decoder LLR data is 16-bit fixed
typedef int16_t channel_t;
#define MAX 256
typedef int16_t llr_t; // internal decoder LLR data is 16-bit fixed
typedef int16_t channel_t;
#define MAX 256
#endif
void log_map (llr_t* systematic,channel_t* y_parity, llr_t* m11, llr_t* m10, llr_t *alpha, llr_t *beta, llr_t* ext,unsigned short frame_length,unsigned char term_flag,unsigned char F,int offset8_flag,
void log_map (llr_t *systematic,channel_t *y_parity, llr_t *m11, llr_t *m10, llr_t *alpha, llr_t *beta, llr_t *ext,unsigned short frame_length,unsigned char term_flag,unsigned char F,int offset8_flag,
time_stats_t *alpha_stats,time_stats_t *beta_stats,time_stats_t *gamma_stats,time_stats_t *ext_stats);
void compute_gamma(llr_t* m11,llr_t* m10,llr_t* systematic, channel_t* y_parity, unsigned short frame_length,unsigned char term_flag);
void compute_alpha(llr_t*alpha,llr_t *beta, llr_t* m11,llr_t* m10, unsigned short frame_length,unsigned char F);
void compute_beta(llr_t*alpha, llr_t* beta,llr_t* m11,llr_t* m10, unsigned short frame_length,unsigned char F,int offset8_flag);
void compute_ext(llr_t* alpha,llr_t* beta,llr_t* m11,llr_t* m10,llr_t* extrinsic, llr_t* ap, unsigned short frame_length);
void compute_gamma(llr_t *m11,llr_t *m10,llr_t *systematic, channel_t *y_parity, unsigned short frame_length,unsigned char term_flag);
void compute_alpha(llr_t *alpha,llr_t *beta, llr_t *m11,llr_t *m10, unsigned short frame_length,unsigned char F);
void compute_beta(llr_t *alpha, llr_t *beta,llr_t *m11,llr_t *m10, unsigned short frame_length,unsigned char F,int offset8_flag);
void compute_ext(llr_t *alpha,llr_t *beta,llr_t *m11,llr_t *m10,llr_t *extrinsic, llr_t *ap, unsigned short frame_length);
void print_bytes(char *s, __m128i *x)
{
void print_bytes(char *s, __m128i *x) {
int8_t *tempb = (int8_t *)x;
printf("%s : %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s,
tempb[0],tempb[1],tempb[2],tempb[3],tempb[4],tempb[5],tempb[6],tempb[7],
tempb[8],tempb[9],tempb[10],tempb[11],tempb[12],tempb[13],tempb[14],tempb[15]);
}
void log_map(llr_t* systematic,
channel_t* y_parity,
llr_t* m11,
llr_t* m10,
void log_map(llr_t *systematic,
channel_t *y_parity,
llr_t *m11,
llr_t *m10,
llr_t *alpha,
llr_t *beta,
llr_t* ext,
llr_t *ext,
unsigned short frame_length,
unsigned char term_flag,
unsigned char F,
......@@ -120,13 +116,10 @@ void log_map(llr_t* systematic,
time_stats_t *alpha_stats,
time_stats_t *beta_stats,
time_stats_t *gamma_stats,
time_stats_t *ext_stats)
{
time_stats_t *ext_stats) {
#ifdef DEBUG_LOGMAP
msg("log_map, frame_length %d\n",frame_length);
#endif
start_meas(gamma_stats) ;
compute_gamma(m11,m10,systematic,y_parity,frame_length,term_flag) ;
stop_meas(gamma_stats);
......@@ -139,19 +132,15 @@ void log_map(llr_t* systematic,
start_meas(ext_stats) ;
compute_ext(alpha,beta,m11,m10,ext,systematic,frame_length) ;
stop_meas(ext_stats);
}
void compute_gamma(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
unsigned short frame_length,unsigned char term_flag)
{
void compute_gamma(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity,
unsigned short frame_length,unsigned char term_flag) {
int k,K1;
__m128i *systematic128 = (__m128i *)systematic;
__m128i *y_parity128 = (__m128i *)y_parity;
__m128i *m10_128 = (__m128i *)m10;
__m128i *m11_128 = (__m128i *)m11;
#ifdef DEBUG_LOGMAP
msg("compute_gamma, %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
#endif
......@@ -159,7 +148,6 @@ void compute_gamma(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
K1=frame_length>>3;
for (k=0; k<K1; k++) {
m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k],y_parity128[k]),1);
m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k],y_parity128[k]),1);
/*
......@@ -206,13 +194,11 @@ void compute_gamma(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
(int16_t)_mm_extract_epi16(m10_128[k],6),
(int16_t)_mm_extract_epi16(m10_128[k],7));
*/
}
// Termination
m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k+term_flag],y_parity128[k]),1);
m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k+term_flag],y_parity128[k]),1);
// printf("gamma (term): %d,%d, %d,%d, %d,%d\n",m11[k<<3],m10[k<<3],m11[1+(k<<3)],m10[1+(k<<3)],m11[2+(k<<3)],m10[2+(k<<3)]);
#else
register __m128i sl,sh,ypl,yph; //K128=_mm_set1_epi8(-128);
......@@ -231,7 +217,6 @@ void compute_gamma(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
// m10_128[k] = _mm_subs_epi8(systematic128[k],y_parity128[k]);
// m11_128[k] = _mm_sub_epi8(_mm_avg_epu8(_mm_add_epi8(systematic128[k],K128),_mm_add_epi8(y_parity128[k],K128)),K128);
// m10_128[k] = _mm_sub_epi8(_mm_avg_epu8(_mm_add_epi8(systematic128[k],K128),_mm_add_epi8(_mm_sign_epi8(y_parity128[k],K128),K128)),K128);
/*
printf("gamma %d: s %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",
k,
......@@ -309,7 +294,6 @@ void compute_gamma(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
}
// Termination
sl = _mm_cvtepi8_epi16(systematic128[k+term_flag]);
sh = _mm_cvtepi8_epi16(_mm_srli_si128(systematic128[k],8));
ypl = _mm_cvtepi8_epi16(y_parity128[k+term_flag]);
......@@ -318,7 +302,6 @@ void compute_gamma(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
_mm_srai_epi16(_mm_adds_epi16(sh,yph),1));
m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1),
_mm_srai_epi16(_mm_subs_epi16(sh,yph),1));
// m11_128[k] = _mm_adds_epi8(systematic128[k+term_flag],y_parity128[k]);
// m10_128[k] = _mm_subs_epi8(systematic128[k+term_flag],y_parity128[k]);
// m11_128[k] = _mm_sub_epi8(_mm_avg_epu8(_mm_add_epi8(systematic128[k+term_flag],K128),_mm_add_epi8(y_parity128[k],K128)),K128);
......@@ -383,20 +366,17 @@ void compute_gamma(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
#endif
_mm_empty();
_m_empty();
}
#define L 40
void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned short frame_length,unsigned char F)
{
void compute_alpha(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned short frame_length,unsigned char F) {
int k,l,l2,K1,rerun_flag=0;
__m128i *alpha128=(__m128i *)alpha,*alpha_ptr;
__m128i a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p;
__m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
__m128i new0,new1,new2,new3,new4,new5,new6,new7;
__m128i alpha_max;
#ifndef LLR8
l2 = L>>3;
K1 = (frame_length>>3);
......@@ -439,19 +419,16 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
}
alpha_ptr = &alpha128[0];
m11p = (__m128i*)m_11;
m10p = (__m128i*)m_10;
m11p = (__m128i *)m_11;
m10p = (__m128i *)m_10;
for (k=0;
k<l;
k++) {
a1=_mm_load_si128(&alpha_ptr[1]);
a3=_mm_load_si128(&alpha_ptr[3]);
a5=_mm_load_si128(&alpha_ptr[5]);
a7=_mm_load_si128(&alpha_ptr[7]);
m_b0 = _mm_adds_epi16(a1,*m11p); // m11
m_b4 = _mm_subs_epi16(a1,*m11p); // m00=-m11
m_b1 = _mm_subs_epi16(a3,*m10p); // m01=-m10
......@@ -460,12 +437,10 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
m_b6 = _mm_subs_epi16(a5,*m10p); // m01=-m10
m_b3 = _mm_subs_epi16(a7,*m11p); // m00=-m11
m_b7 = _mm_adds_epi16(a7,*m11p); // m11
a0=_mm_load_si128(&alpha_ptr[0]);
a2=_mm_load_si128(&alpha_ptr[2]);
a4=_mm_load_si128(&alpha_ptr[4]);
a6=_mm_load_si128(&alpha_ptr[6]);
new0 = _mm_subs_epi16(a0,*m11p); // m00=-m11
new4 = _mm_adds_epi16(a0,*m11p); // m11
new1 = _mm_adds_epi16(a2,*m10p); // m10
......@@ -474,7 +449,6 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
new6 = _mm_adds_epi16(a4,*m10p); // m10
new3 = _mm_adds_epi16(a6,*m11p); // m11
new7 = _mm_subs_epi16(a6,*m11p); // m00=-m11
a0 = _mm_max_epi16(m_b0,new0);
a1 = _mm_max_epi16(m_b1,new1);
a2 = _mm_max_epi16(m_b2,new2);
......@@ -483,7 +457,6 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
a5 = _mm_max_epi16(m_b5,new5);
a6 = _mm_max_epi16(m_b6,new6);
a7 = _mm_max_epi16(m_b7,new7);
alpha_max = _mm_max_epi16(a0,a1);
alpha_max = _mm_max_epi16(alpha_max,a2);
alpha_max = _mm_max_epi16(alpha_max,a3);
......@@ -491,7 +464,6 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
alpha_max = _mm_max_epi16(alpha_max,a5);
alpha_max = _mm_max_epi16(alpha_max,a6);
alpha_max = _mm_max_epi16(alpha_max,a7);
alpha_ptr+=8;
m11p++;
m10p++;
......@@ -503,7 +475,6 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
alpha_ptr[5] = _mm_subs_epi16(a5,alpha_max);
alpha_ptr[6] = _mm_subs_epi16(a6,alpha_max);
alpha_ptr[7] = _mm_subs_epi16(a7,alpha_max);
}
/*
......@@ -981,9 +952,7 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
*/
#else
if (rerun_flag == 0) {
alpha128[0] = _mm_set_epi8(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,0);
alpha128[1] = _mm_set_epi8(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[2] = _mm_set_epi8(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
......@@ -992,8 +961,6 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
alpha128[5] = _mm_set_epi8(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[6] = _mm_set_epi8(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
alpha128[7] = _mm_set_epi8(-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2,-MAX/2);
} else {
alpha128[0] = _mm_slli_si128(alpha128[(K1<<3)],1);
alpha128[1] = _mm_slli_si128(alpha128[1+(K1<<3)],1);
......@@ -1025,15 +992,12 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
print_bytes("a6:",&alpha_ptr[6]);
print_bytes("a7:",&alpha_ptr[7]);
*/
m11p = (__m128i*)m_11;
m10p = (__m128i*)m_10;
m11p = (__m128i *)m_11;
m10p = (__m128i *)m_10;
for (k=0;
k<l;
k++) {
m_b0 = _mm_adds_epi8(alpha_ptr[1],*m11p); // m11
m_b4 = _mm_subs_epi8(alpha_ptr[1],*m11p); // m00=-m11
m_b1 = _mm_subs_epi8(alpha_ptr[3],*m10p); // m01=-m10
......@@ -1042,7 +1006,6 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
m_b6 = _mm_subs_epi8(alpha_ptr[5],*m10p); // m01=-m10
m_b3 = _mm_subs_epi8(alpha_ptr[7],*m11p); // m00=-m11
m_b7 = _mm_adds_epi8(alpha_ptr[7],*m11p); // m11
new0 = _mm_subs_epi8(alpha_ptr[0],*m11p); // m00=-m11
new4 = _mm_adds_epi8(alpha_ptr[0],*m11p); // m11
new1 = _mm_adds_epi8(alpha_ptr[2],*m10p); // m10
......@@ -1051,7 +1014,6 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
new6 = _mm_adds_epi8(alpha_ptr[4],*m10p); // m10
new3 = _mm_adds_epi8(alpha_ptr[6],*m11p); // m11
new7 = _mm_subs_epi8(alpha_ptr[6],*m11p); // m00=-m11
alpha_ptr += 8;
m11p++;
m10p++;
......@@ -1063,8 +1025,6 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
alpha_ptr[5] = _mm_max_epi8(m_b5,new5);
alpha_ptr[6] = _mm_max_epi8(m_b6,new6);
alpha_ptr[7] = _mm_max_epi8(m_b7,new7);
// compute and subtract maxima
alpha_max = _mm_max_epi8(alpha_ptr[0],alpha_ptr[1]);
alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[2]);
......@@ -1073,7 +1033,6 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[5]);
alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[6]);
alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[7]);
alpha_ptr[0] = _mm_subs_epi8(alpha_ptr[0],alpha_max);
alpha_ptr[1] = _mm_subs_epi8(alpha_ptr[1],alpha_max);
alpha_ptr[2] = _mm_subs_epi8(alpha_ptr[2],alpha_max);
......@@ -1109,14 +1068,11 @@ void compute_alpha(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sho
}
void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned short frame_length,unsigned char F,int offset8_flag)
{
void compute_beta(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned short frame_length,unsigned char F,int offset8_flag) {
int k,rerun_flag=0;
__m128i m11_128,m10_128;
__m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
__m128i new0,new1,new2,new3,new4,new5,new6,new7;
__m128i *beta128,*alpha128,*beta_ptr;
__m128i beta_max;
int16_t m11,m10,beta0_16,beta1_16,beta2_16,beta3_16,beta4_16,beta5_16,beta6_16,beta7_16,beta0_2,beta1_2,beta2_2,beta3_2,beta_m;
......@@ -1124,30 +1080,21 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
#ifdef LLR8
llr_t beta2,beta3,beta4,beta5,beta6,beta7;
__m128i beta_16;
#endif
#ifdef DEBUG_LOGMAP
msg("compute_beta, %p,%p,%p,%p,framelength %d,F %d\n",
beta,m_11,m_10,alpha,frame_length,F);
#endif
// termination for beta initialization
// printf("beta init: offset8 %d\n",offset8_flag);
m11=(int16_t)m_11[2+frame_length];
m10=(int16_t)m_10[2+frame_length];
// printf("m11,m10 %d,%d\n",m11,m10);
beta0 = -m11;//M0T_TERM;
beta1 = m11;//M1T_TERM;
m11=(int16_t)m_11[1+frame_length];
m10=(int16_t)m_10[1+frame_length];
// printf("m11,m10 %d,%d\n",m11,m10);
beta0_2 = beta0-m11;//+M0T_TERM;
beta1_2 = beta0+m11;//+M1T_TERM;
beta2_2 = beta1+m10;//M2T_TERM;
......@@ -1155,7 +1102,6 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
m11=(int16_t)m_11[frame_length];
m10=(int16_t)m_10[frame_length];
// printf("m11,m10 %d,%d (%p)\n",m11,m10,m_11+frame_length);
beta0_16 = beta0_2-m11;//+M0T_TERM;
beta1_16 = beta0_2+m11;//+M1T_TERM;
beta2_16 = beta1_2+m10;//+M2T_TERM;
......@@ -1164,8 +1110,6 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
beta5_16 = beta2_2+m10;//+M5T_TERM;
beta6_16 = beta3_2+m11;//+M6T_TERM;
beta7_16 = beta3_2-m11;//+M7T_TERM;
beta_m = (beta0_16>beta1_16) ? beta0_16 : beta1_16;
beta_m = (beta_m>beta2_16) ? beta_m : beta2_16;
beta_m = (beta_m>beta3_16) ? beta_m : beta3_16;
......@@ -1173,8 +1117,6 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
beta_m = (beta_m>beta5_16) ? beta_m : beta5_16;
beta_m = (beta_m>beta6_16) ? beta_m : beta6_16;
beta_m = (beta_m>beta7_16) ? beta_m : beta7_16;
beta0_16=beta0_16-beta_m;
beta1_16=beta1_16-beta_m;
beta2_16=beta2_16-beta_m;
......@@ -1183,7 +1125,6 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
beta5_16=beta5_16-beta_m;
beta6_16=beta6_16-beta_m;
beta7_16=beta7_16-beta_m;
#ifdef LLR8
beta_16 = _mm_set_epi16(beta7_16,beta6_16,beta5_16,beta4_16,beta3_16,beta2_16,beta1_16,beta0_16);
beta_16 = _mm_packs_epi16(beta_16,beta_16);
......@@ -1199,8 +1140,8 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
#endif
for (rerun_flag=0;; rerun_flag=1) {
beta_ptr = (__m128i*)&beta[frame_length<<3];
alpha128 = (__m128i*)&alpha[0];
beta_ptr = (__m128i *)&beta[frame_length<<3];
alpha128 = (__m128i *)&alpha[0];
if (rerun_flag == 0) {
#ifndef LLR8
......@@ -1223,9 +1164,8 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
beta_ptr[7] = alpha128[7+(frame_length>>1)];
#endif
} else {
beta128 = (__m128i*)&beta[0];
beta128 = (__m128i *)&beta[0];
#ifndef LLR8
beta_ptr[0] = _mm_srli_si128(beta128[0],2);
beta_ptr[1] = _mm_srli_si128(beta128[1],2);
beta_ptr[2] = _mm_srli_si128(beta128[2],2);
......@@ -1255,7 +1195,6 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
beta_ptr[5] = _mm_insert_epi16(beta_ptr[5],beta5_16,7);
beta_ptr[6] = _mm_insert_epi16(beta_ptr[6],beta6_16,7);
beta_ptr[7] = _mm_insert_epi16(beta_ptr[7],beta7_16,7);
/*
beta[7+(frame_length<<3)] = beta0_16;
beta[15+(frame_length<<3)] = beta1_16;
......@@ -1277,18 +1216,15 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
beta_ptr[6] = _mm_insert_epi8(beta_ptr[6],beta6,15);
beta_ptr[7] = _mm_insert_epi8(beta_ptr[7],beta7,15);
} else {
}
#endif
#ifndef LLR8
int loopval=((rerun_flag==0)?0:((frame_length-L)>>3));
for (k=(frame_length>>3)-1; k>=loopval; k--) {
m11_128=((__m128i*)m_11)[k];
m10_128=((__m128i*)m_10)[k];
m11_128=((__m128i *)m_11)[k];
m10_128=((__m128i *)m_10)[k];
m_b0 = _mm_adds_epi16(beta_ptr[4],m11_128); //m11
m_b1 = _mm_subs_epi16(beta_ptr[4],m11_128); //m00
m_b2 = _mm_subs_epi16(beta_ptr[5],m10_128); //m01
......@@ -1297,7 +1233,6 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
m_b5 = _mm_subs_epi16(beta_ptr[6],m10_128); //m01
m_b6 = _mm_subs_epi16(beta_ptr[7],m11_128); //m00
m_b7 = _mm_adds_epi16(beta_ptr[7],m11_128); //m11
new0 = _mm_subs_epi16(beta_ptr[0],m11_128); //m00
new1 = _mm_adds_epi16(beta_ptr[0],m11_128); //m11
new2 = _mm_adds_epi16(beta_ptr[1],m10_128); //m10
......@@ -1306,9 +1241,7 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
new5 = _mm_adds_epi16(beta_ptr[2],m10_128); //m10
new6 = _mm_adds_epi16(beta_ptr[3],m11_128); //m11
new7 = _mm_subs_epi16(beta_ptr[3],m11_128); //m00
beta_ptr-=8;
beta_ptr[0] = _mm_max_epi16(m_b0,new0);
beta_ptr[1] = _mm_max_epi16(m_b1,new1);
beta_ptr[2] = _mm_max_epi16(m_b2,new2);
......@@ -1317,7 +1250,6 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
beta_ptr[5] = _mm_max_epi16(m_b5,new5);
beta_ptr[6] = _mm_max_epi16(m_b6,new6);
beta_ptr[7] = _mm_max_epi16(m_b7,new7);
beta_max = _mm_max_epi16(beta_ptr[0],beta_ptr[1]);
beta_max = _mm_max_epi16(beta_max ,beta_ptr[2]);
beta_max = _mm_max_epi16(beta_max ,beta_ptr[3]);
......@@ -1325,7 +1257,6 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
beta_max = _mm_max_epi16(beta_max ,beta_ptr[5]);
beta_max = _mm_max_epi16(beta_max ,beta_ptr[6]);
beta_max = _mm_max_epi16(beta_max ,beta_ptr[7]);
beta_ptr[0] = _mm_subs_epi16(beta_ptr[0],beta_max);
beta_ptr[1] = _mm_subs_epi16(beta_ptr[1],beta_max);
beta_ptr[2] = _mm_subs_epi16(beta_ptr[2],beta_max);
......@@ -1334,14 +1265,11 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
beta_ptr[5] = _mm_subs_epi16(beta_ptr[5],beta_max);
beta_ptr[6] = _mm_subs_epi16(beta_ptr[6],beta_max);
beta_ptr[7] = _mm_subs_epi16(beta_ptr[7],beta_max);
}
#else
#ifdef DEBUG_LOGMAP
printf("beta0 %d: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
printf("beta0 %u: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
(frame_length>>4),
_mm_extract_epi8(beta_ptr[0],0),
_mm_extract_epi8(beta_ptr[0],1),
......@@ -1359,7 +1287,7 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
_mm_extract_epi8(beta_ptr[0],13),
_mm_extract_epi8(beta_ptr[0],14),
_mm_extract_epi8(beta_ptr[0],15));
printf("beta1 %d: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
printf("beta1 %u: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
(frame_length>>4),
_mm_extract_epi8(beta_ptr[1],0),
_mm_extract_epi8(beta_ptr[1],1),
......@@ -1377,7 +1305,7 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
_mm_extract_epi8(beta_ptr[1],13),
_mm_extract_epi8(beta_ptr[1],14),
_mm_extract_epi8(beta_ptr[1],15));
printf("beta2 %d: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
printf("beta2 %u: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
(frame_length>>4),
_mm_extract_epi8(beta_ptr[2],0),
_mm_extract_epi8(beta_ptr[2],1),
......@@ -1395,7 +1323,7 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
_mm_extract_epi8(beta_ptr[2],13),
_mm_extract_epi8(beta_ptr[2],14),
_mm_extract_epi8(beta_ptr[2],15));
printf("beta3 %d: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
printf("beta3 %u: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
(frame_length>>4),
_mm_extract_epi8(beta_ptr[3],0),
_mm_extract_epi8(beta_ptr[3],1),
......@@ -1413,7 +1341,7 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
_mm_extract_epi8(beta_ptr[3],13),
_mm_extract_epi8(beta_ptr[3],14),
_mm_extract_epi8(beta_ptr[3],15));
printf("beta4 %d: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
printf("beta4 %u: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
(frame_length>>4),
_mm_extract_epi8(beta_ptr[4],0),
_mm_extract_epi8(beta_ptr[4],1),
......@@ -1431,7 +1359,7 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
_mm_extract_epi8(beta_ptr[4],13),
_mm_extract_epi8(beta_ptr[4],14),
_mm_extract_epi8(beta_ptr[4],15));
printf("beta5 %d: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
printf("beta5 %u: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
(frame_length>>4),
_mm_extract_epi8(beta_ptr[5],0),
_mm_extract_epi8(beta_ptr[5],1),
......@@ -1449,7 +1377,7 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
_mm_extract_epi8(beta_ptr[5],13),
_mm_extract_epi8(beta_ptr[5],14),
_mm_extract_epi8(beta_ptr[5],15));
printf("beta6 %d: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
printf("beta6 %u: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
(frame_length>>4),
_mm_extract_epi8(beta_ptr[6],0),
_mm_extract_epi8(beta_ptr[6],1),
......@@ -1467,7 +1395,7 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
_mm_extract_epi8(beta_ptr[6],13),
_mm_extract_epi8(beta_ptr[6],14),
_mm_extract_epi8(beta_ptr[6],15));
printf("beta7 %d: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
printf("beta7 %u: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
(frame_length>>4),
_mm_extract_epi8(beta_ptr[7],0),
_mm_extract_epi8(beta_ptr[7],1),
......@@ -1491,9 +1419,8 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
__m128i zeros=_mm_set1_epi8(0);
for (k=(frame_length>>4)-1; k>=loopval; k--) {
m11_128=((__m128i*)m_11)[k];
m10_128=((__m128i*)m_10)[k];
m11_128=((__m128i *)m_11)[k];
m10_128=((__m128i *)m_10)[k];
/*
if ((offset8_flag==1) && (k==((frame_length>>4)-9))) {
beta_ptr[0] = _mm_insert_epi8(beta_ptr[0],beta0,15);
......@@ -1506,9 +1433,6 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
beta_ptr[7] = _mm_insert_epi8(beta_ptr[7],beta7,15);
}*/
// print_bytes("m11:",&m11_128);
m_b0 = _mm_adds_epi8(beta_ptr[4],m11_128); //m11
m_b1 = _mm_subs_epi8(beta_ptr[4],m11_128); //m00
m_b2 = _mm_subs_epi8(beta_ptr[5],m10_128); //m01
......@@ -1517,7 +1441,6 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
m_b5 = _mm_subs_epi8(beta_ptr[6],m10_128); //m01
m_b6 = _mm_subs_epi8(beta_ptr[7],m11_128); //m00
m_b7 = _mm_adds_epi8(beta_ptr[7],m11_128); //m11
new0 = _mm_subs_epi8(beta_ptr[0],m11_128); //m00
new1 = _mm_adds_epi8(beta_ptr[0],m11_128); //m11
new2 = _mm_adds_epi8(beta_ptr[1],m10_128); //m10
......@@ -1526,9 +1449,7 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
new5 = _mm_adds_epi8(beta_ptr[2],m10_128); //m10
new6 = _mm_adds_epi8(beta_ptr[3],m11_128); //m11
new7 = _mm_subs_epi8(beta_ptr[3],m11_128); //m00
beta_ptr-=8;
beta_ptr[0] = _mm_max_epi8(m_b0,new0);
beta_ptr[1] = _mm_max_epi8(m_b1,new1);
beta_ptr[2] = _mm_max_epi8(m_b2,new2);
......@@ -1537,7 +1458,6 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
beta_ptr[5] = _mm_max_epi8(m_b5,new5);
beta_ptr[6] = _mm_max_epi8(m_b6,new6);
beta_ptr[7] = _mm_max_epi8(m_b7,new7);
beta_max = _mm_max_epi8(beta_ptr[0],beta_ptr[1]);
beta_max = _mm_max_epi8(beta_max ,beta_ptr[2]);
beta_max = _mm_max_epi8(beta_max ,beta_ptr[3]);
......@@ -1553,7 +1473,6 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
beta_ptr[5] = _mm_subs_epi8(beta_ptr[5],beta_max);
beta_ptr[6] = _mm_subs_epi8(beta_ptr[6],beta_max);
beta_ptr[7] = _mm_subs_epi8(beta_ptr[7],beta_max);
/*
printf("beta0 %d: %03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d,%03d\n",
k,
......@@ -1700,7 +1619,6 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
_mm_extract_epi8(beta_ptr[7],14),
_mm_extract_epi8(beta_ptr[7],15));
*/
}
#endif
......@@ -1713,8 +1631,7 @@ void compute_beta(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned shor
_m_empty();
}
void compute_ext(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, llr_t* systematic,unsigned short frame_length)
{
void compute_ext(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,llr_t *ext, llr_t *systematic,unsigned short frame_length) {
__m128i *alpha128=(__m128i *)alpha;
__m128i *beta128=(__m128i *)beta;
__m128i *m11_128,*m10_128,*ext_128;
......@@ -1724,26 +1641,20 @@ void compute_ext(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, ll
__m128i m10_1,m10_2,m10_3,m10_4;
__m128i m11_1,m11_2,m11_3,m11_4;
int k;
//
// LLR computation, 8 consequtive bits per loop
//
#ifdef DEBUG_LOGMAP
msg("compute_ext, %p, %p, %p, %p, %p, %p ,framelength %d\n",alpha,beta,m_11,m_10,ext,systematic,frame_length);
#endif
alpha_ptr = alpha128;
beta_ptr = &beta128[8];
#ifndef LLR8
for (k=0; k<(frame_length>>3); k++) {
m11_128 = (__m128i*)&m_11[k<<3];
m10_128 = (__m128i*)&m_10[k<<3];
ext_128 = (__m128i*)&ext[k<<3];
m11_128 = (__m128i *)&m_11[k<<3];
m10_128 = (__m128i *)&m_10[k<<3];
ext_128 = (__m128i *)&ext[k<<3];
/*
printf("EXT %03d\n",k);
print_shorts("a0:",&alpha_ptr[0]);
......@@ -1809,23 +1720,18 @@ void compute_ext(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, ll
m11_1 = _mm_max_epi16(m11_1,m11_2);
m11_1 = _mm_max_epi16(m11_1,m11_3);
m11_1 = _mm_max_epi16(m11_1,m11_4);
// print_shorts("m11_1:",&m11_1);
m01_1 = _mm_subs_epi16(m01_1,*m10_128);
m00_1 = _mm_subs_epi16(m00_1,*m11_128);
m10_1 = _mm_adds_epi16(m10_1,*m10_128);
m11_1 = _mm_adds_epi16(m11_1,*m11_128);
// print_shorts("m10_1:",&m10_1);
// print_shorts("m11_1:",&m11_1);
m01_1 = _mm_max_epi16(m01_1,m00_1);
m10_1 = _mm_max_epi16(m10_1,m11_1);
// print_shorts("m01_1:",&m01_1);
// print_shorts("m10_1:",&m10_1);
*ext_128 = _mm_subs_epi16(m10_1,m01_1);
/*
print_shorts("ext:",ext_128);
print_shorts("m11:",m11_128);
......@@ -1834,7 +1740,6 @@ void compute_ext(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, ll
print_shorts("m01_1:",&m01_1);
print_shorts("syst:",systematic_128);
*/
alpha_ptr+=8;
beta_ptr+=8;
}
......@@ -1842,11 +1747,9 @@ void compute_ext(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, ll
#else
for (k=0; k<(frame_length>>4); k++) {
m11_128 = (__m128i*)&m_11[k<<4];
m10_128 = (__m128i*)&m_10[k<<4];
ext_128 = (__m128i*)&ext[k<<4];
m11_128 = (__m128i *)&m_11[k<<4];
m10_128 = (__m128i *)&m_10[k<<4];
ext_128 = (__m128i *)&ext[k<<4];
m00_4 = _mm_adds_epi8(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
m11_4 = _mm_adds_epi8(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
m00_3 = _mm_adds_epi8(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
......@@ -1863,7 +1766,6 @@ void compute_ext(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, ll
m10_2 = _mm_adds_epi8(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
m10_1 = _mm_adds_epi8(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
m01_1 = _mm_adds_epi8(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
m01_1 = _mm_max_epi8(m01_1,m01_2);
m01_1 = _mm_max_epi8(m01_1,m01_3);
m01_1 = _mm_max_epi8(m01_1,m01_4);
......@@ -1876,29 +1778,20 @@ void compute_ext(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, ll
m11_1 = _mm_max_epi8(m11_1,m11_2);
m11_1 = _mm_max_epi8(m11_1,m11_3);
m11_1 = _mm_max_epi8(m11_1,m11_4);
m01_1 = _mm_subs_epi8(m01_1,*m10_128);
m00_1 = _mm_subs_epi8(m00_1,*m11_128);
m10_1 = _mm_adds_epi8(m10_1,*m10_128);
m11_1 = _mm_adds_epi8(m11_1,*m11_128);
m01_1 = _mm_max_epi8(m01_1,m00_1);
m10_1 = _mm_max_epi8(m10_1,m11_1);
*ext_128 = _mm_subs_epi8(m10_1,m01_1);
alpha_ptr+=8;
beta_ptr+=8;
}
#endif
_mm_empty();
_m_empty();
}
......@@ -1906,8 +1799,7 @@ void compute_ext(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, ll
//int pi2[n],pi3[n+8],pi5[n+8],pi4[n+8],pi6[n+8],
int *pi2tab[188],*pi5tab[188],*pi4tab[188],*pi6tab[188];
void free_td()
{
void free_td() {
int ind;
for (ind = 0; ind < 188; ind++) {
......@@ -1918,21 +1810,17 @@ void free_td()
}
}
void init_td()
{
void init_td() {
int ind,i,i2,i3,j,n,n2,pi,pi3;
short * base_interleaver;
short *base_interleaver;
for (ind=0; ind<188; ind++) {
n = f1f2mat[ind].nb_bits;
base_interleaver=il_tb+f1f2mat[ind].beg_index;
pi2tab[ind] = malloc((n+8)*sizeof(int));
pi5tab[ind] = malloc((n+8)*sizeof(int));
pi4tab[ind] = malloc((n+8)*sizeof(int));
pi6tab[ind] = malloc((n+8)*sizeof(int));
#ifdef LLR8
if ((n&15)>0) {
......@@ -1941,7 +1829,6 @@ void init_td()
n2 = n;
for (j=0,i=0; i<n2; i++,j+=16) {
if (j>=n2)
j-=(n2-1);
......@@ -1956,10 +1843,8 @@ void init_td()
j=i2;
for (i3=0; i3<(n>>3); i3++,i++,j+=8) {
// if (j>=n)
// j-=(n-1);
pi2tab[ind][i] = j;
// printf("pi2[%d] = %d\n",i,j);
}
......@@ -1967,7 +1852,6 @@ void init_td()
#endif
for (i=0; i<n2; i++) {
pi = base_interleaver[i];//(unsigned int)threegpplte_interleaver(f1,f2,n);
pi3 = pi2tab[ind][pi];
......@@ -1975,7 +1859,6 @@ void init_td()
pi5tab[ind][pi3] = pi2tab[ind][i];
pi6tab[ind][pi] = pi2tab[ind][i];
}
}
}
......@@ -1991,33 +1874,25 @@ unsigned char phy_threegpplte_turbo_decoder(short *y,
time_stats_t *gamma_stats,
time_stats_t *ext_stats,
time_stats_t *intl1_stats,
time_stats_t *intl2_stats)
{
time_stats_t *intl2_stats) {
/* y is a pointer to the input
decoded_bytes is a pointer to the decoded output
n is the size in bits of the coded block, with the tail */
int n2;
#ifdef LLR8
llr_t y8[3*(n+16)] __attribute__((aligned(16)));
#endif
llr_t systematic0[n+16] __attribute__ ((aligned(16)));
llr_t systematic1[n+16] __attribute__ ((aligned(16)));
llr_t systematic2[n+16] __attribute__ ((aligned(16)));
llr_t yparity1[n+16] __attribute__ ((aligned(16)));
llr_t yparity2[n+16] __attribute__ ((aligned(16)));
llr_t ext[n+128] __attribute__((aligned(16)));
llr_t ext2[n+128] __attribute__((aligned(16)));
llr_t alpha[(n+16)*8] __attribute__ ((aligned(16)));
llr_t beta[(n+16)*8] __attribute__ ((aligned(16)));
llr_t m11[n+16] __attribute__ ((aligned(16)));
llr_t m10[n+16] __attribute__ ((aligned(16)));
int *pi2_p,*pi4_p,*pi5_p,*pi6_p;
llr_t *s,*s1,*s2,*yp1,*yp2,*yp;
__m128i *yp128;
......@@ -2026,12 +1901,10 @@ unsigned char phy_threegpplte_turbo_decoder(short *y,
unsigned int crc,oldcrc,crc_len;
uint8_t temp;
__m128i tmp128[(n+8)>>3];
__m128i tmp, zeros=_mm_setzero_si128();
#ifdef LLR8
__m128i MAX128=_mm_set1_epi16(MAX/2);
#endif
register __m128i tmpe;
int offset8_flag=0;
......@@ -2040,9 +1913,7 @@ unsigned char phy_threegpplte_turbo_decoder(short *y,
return 255;
}
start_meas(init_stats);
#ifdef LLR8
if ((n&15)>0) {
......@@ -2087,199 +1958,154 @@ unsigned char phy_threegpplte_turbo_decoder(short *y,
//((__m128i *)y8)[i] = _mm_packs_epi16(((__m128i *)y)[j],((__m128i *)y)[j+1]);
}
yp128 = (__m128i*)y8;
yp128 = (__m128i *)y8;
#else
yp128 = (__m128i*)y;
yp128 = (__m128i *)y;
#endif
s = systematic0;
s1 = systematic1;
s2 = systematic2;
yp1 = yparity1;
yp2 = yparity2;
#ifndef LLR8
for (i=0; i<n2; i+=8) {
pi2_p = &pi2tab[iind][i];
j=pi2_p[0];
tmpe = _mm_load_si128(yp128);
s[j] = _mm_extract_epi16(tmpe,0);
yp1[j] = _mm_extract_epi16(tmpe,1);
yp2[j] = _mm_extract_epi16(tmpe,2);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[1];
s[j] = _mm_extract_epi16(tmpe,3);
yp1[j] = _mm_extract_epi16(tmpe,4);
yp2[j] = _mm_extract_epi16(tmpe,5);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[2];
s[j] = _mm_extract_epi16(tmpe,6);
yp1[j] = _mm_extract_epi16(tmpe,7);
tmpe = _mm_load_si128(&yp128[1]);
yp2[j] = _mm_extract_epi16(tmpe,0);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[3];
s[j] = _mm_extract_epi16(tmpe,1);
yp1[j] = _mm_extract_epi16(tmpe,2);
yp2[j] = _mm_extract_epi16(tmpe,3);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[4];
s[j] = _mm_extract_epi16(tmpe,4);
yp1[j] = _mm_extract_epi16(tmpe,5);
yp2[j] = _mm_extract_epi16(tmpe,6);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[5];
s[j] = _mm_extract_epi16(tmpe,7);
tmpe = _mm_load_si128(&yp128[2]);
yp1[j] = _mm_extract_epi16(tmpe,0);
yp2[j] = _mm_extract_epi16(tmpe,1);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[6];
s[j] = _mm_extract_epi16(tmpe,2);
yp1[j] = _mm_extract_epi16(tmpe,3);
yp2[j] = _mm_extract_epi16(tmpe,4);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[7];
s[j] = _mm_extract_epi16(tmpe,5);
yp1[j] = _mm_extract_epi16(tmpe,6);
yp2[j] = _mm_extract_epi16(tmpe,7);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
yp128+=3;
}
#else
for (i=0; i<n2; i+=16) {
pi2_p = &pi2tab[iind][i];
j=pi2_p[0];
s[j] = _mm_extract_epi8(yp128[0],0);
yp1[j] = _mm_extract_epi8(yp128[0],1);
yp2[j] = _mm_extract_epi8(yp128[0],2);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[1];
s[j] = _mm_extract_epi8(yp128[0],3);
yp1[j] = _mm_extract_epi8(yp128[0],4);
yp2[j] = _mm_extract_epi8(yp128[0],5);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[2];
s[j] = _mm_extract_epi8(yp128[0],6);
yp1[j] = _mm_extract_epi8(yp128[0],7);
yp2[j] = _mm_extract_epi8(yp128[0],8);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[3];
s[j] = _mm_extract_epi8(yp128[0],9);
yp1[j] = _mm_extract_epi8(yp128[0],10);
yp2[j] = _mm_extract_epi8(yp128[0],11);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[4];
s[j] = _mm_extract_epi8(yp128[0],12);
yp1[j] = _mm_extract_epi8(yp128[0],13);
yp2[j] = _mm_extract_epi8(yp128[0],14);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[5];
s[j] = _mm_extract_epi8(yp128[0],15);
yp1[j] = _mm_extract_epi8(yp128[1],0);
yp2[j] = _mm_extract_epi8(yp128[1],1);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[6];
s[j] = _mm_extract_epi8(yp128[1],2);
yp1[j] = _mm_extract_epi8(yp128[1],3);
yp2[j] = _mm_extract_epi8(yp128[1],4);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[7];
s[j] = _mm_extract_epi8(yp128[1],5);
yp1[j] = _mm_extract_epi8(yp128[1],6);
yp2[j] = _mm_extract_epi8(yp128[1],7);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[8];
s[j] = _mm_extract_epi8(yp128[1],8);
yp1[j] = _mm_extract_epi8(yp128[1],9);
yp2[j] = _mm_extract_epi8(yp128[1],10);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[9];
s[j] = _mm_extract_epi8(yp128[1],11);
yp1[j] = _mm_extract_epi8(yp128[1],12);
yp2[j] = _mm_extract_epi8(yp128[1],13);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[10];
s[j] = _mm_extract_epi8(yp128[1],14);
yp1[j] = _mm_extract_epi8(yp128[1],15);
yp2[j] = _mm_extract_epi8(yp128[2],0);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[11];
s[j] = _mm_extract_epi8(yp128[2],1);
yp1[j] = _mm_extract_epi8(yp128[2],2);
yp2[j] = _mm_extract_epi8(yp128[2],3);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[12];
s[j] = _mm_extract_epi8(yp128[2],4);
yp1[j] = _mm_extract_epi8(yp128[2],5);
yp2[j] = _mm_extract_epi8(yp128[2],6);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[13];
s[j] = _mm_extract_epi8(yp128[2],7);
yp1[j] = _mm_extract_epi8(yp128[2],8);
yp2[j] = _mm_extract_epi8(yp128[2],9);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[14];
s[j] = _mm_extract_epi8(yp128[2],10);
yp1[j] = _mm_extract_epi8(yp128[2],11);
yp2[j] = _mm_extract_epi8(yp128[2],12);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
j=pi2_p[15];
s[j] = _mm_extract_epi8(yp128[2],13);
yp1[j] = _mm_extract_epi8(yp128[2],14);
yp2[j] = _mm_extract_epi8(yp128[2],15);
// printf("init: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
yp128+=3;
}
#endif
yp=(llr_t*)yp128;
yp=(llr_t *)yp128;
#ifdef LLR8
if (n2>n) {
......@@ -2290,7 +2116,7 @@ unsigned char phy_threegpplte_turbo_decoder(short *y,
s1[n+4]=0;s1[n+5]=0;s1[n+6]=0;s1[n+7]=0;
s2[n]=0;s2[n+1]=0;s2[n+2]=0;s2[n+3]=0;
s2[n+4]=0;s2[n+5]=0;s2[n+6]=0;s2[n+7]=0;*/
yp=(llr_t*)(y8+n);
yp=(llr_t *)(y8+n);
}
#endif
......@@ -2341,68 +2167,55 @@ unsigned char phy_threegpplte_turbo_decoder(short *y,
#ifdef DEBUG_LOGMAP
msg("\n");
#endif //DEBUG_LOGMAP
stop_meas(init_stats);
// do log_map from first parity bit
log_map(systematic0,yparity1,m11,m10,alpha,beta,ext,n2,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
while (iteration_cnt++ < max_iterations) {
#ifdef DEBUG_LOGMAP
printf("\n*******************ITERATION %d (n %d), ext %p\n\n",iteration_cnt,n,ext);
#endif //DEBUG_LOGMAP
start_meas(intl1_stats);
#ifndef LLR8
pi4_p=pi4tab[iind];
for (i=0; i<(n2>>3); i++) { // steady-state portion
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],0);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],1);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],2);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],3);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],4);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],5);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],6);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t*)ext)[*pi4_p++],7);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t *)ext)[*pi4_p++],0);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t *)ext)[*pi4_p++],1);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t *)ext)[*pi4_p++],2);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t *)ext)[*pi4_p++],3);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t *)ext)[*pi4_p++],4);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t *)ext)[*pi4_p++],5);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t *)ext)[*pi4_p++],6);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],((llr_t *)ext)[*pi4_p++],7);
}
#else
pi4_p=pi4tab[iind];
for (i=0; i<(n2>>4); i++) { // steady-state portion
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],0);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],1);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],2);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],3);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],4);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],5);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],6);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],7);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],8);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],9);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],10);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],11);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],12);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],13);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],14);
((__m128i *)systematic2)[i]=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],15);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],0);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],1);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],2);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],3);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],4);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],5);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],6);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],7);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],8);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],9);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],10);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],11);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],12);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],13);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],14);
((__m128i *)systematic2)[i]=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],15);
}
#endif
stop_meas(intl1_stats);
// do log_map from second parity bit
log_map(systematic2,yparity2,m11,m10,alpha,beta,ext2,n2,1,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
#ifndef LLR8
pi5_p=pi5tab[iind];
......@@ -2415,7 +2228,7 @@ unsigned char phy_threegpplte_turbo_decoder(short *y,
tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],5);
tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],6);
tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],7);
((__m128i *)systematic1)[i] = _mm_adds_epi16(_mm_subs_epi16(tmp,((__m128i*)ext)[i]),((__m128i *)systematic0)[i]);
((__m128i *)systematic1)[i] = _mm_adds_epi16(_mm_subs_epi16(tmp,((__m128i *)ext)[i]),((__m128i *)systematic0)[i]);
}
if (iteration_cnt>1) {
......@@ -2423,17 +2236,16 @@ unsigned char phy_threegpplte_turbo_decoder(short *y,
pi6_p=pi6tab[iind];
for (i=0; i<(n2>>3); i++) {
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],7);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],6);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],5);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],4);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],3);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],2);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],1);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],0);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],7);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],6);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],5);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],4);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],3);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],2);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],1);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],0);
tmp=_mm_cmpgt_epi8(_mm_packs_epi16(tmp,zeros),zeros);
decoded_bytes[i]=(unsigned char)_mm_movemask_epi8(tmp);
}
}
......@@ -2460,8 +2272,7 @@ unsigned char phy_threegpplte_turbo_decoder(short *y,
tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15);
//decoded_bytes_interl[i]=(uint16_t) _mm_movemask_epi8(_mm_cmpgt_epi8(tmp,zeros));
tmp128[i] = _mm_adds_epi8(((__m128i *)ext2)[i],((__m128i *)systematic2)[i]);
((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i*)ext)[i]),((__m128i *)systematic0)[i]);
((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i *)ext)[i]),((__m128i *)systematic0)[i]);
}
/* LT modification, something wrong here
......@@ -2526,7 +2337,6 @@ unsigned char phy_threegpplte_turbo_decoder(short *y,
oldcrc= *((unsigned int *)(&decoded_bytes[(n>>3)-crc_len]));
switch (crc_type) {
case CRC24_A:
oldcrc&=0x00ffffff;
crc = crc24a(&decoded_bytes[F>>3],
......@@ -2573,9 +2383,9 @@ unsigned char phy_threegpplte_turbo_decoder(short *y,
// do log_map from first parity bit
if (iteration_cnt < max_iterations) {
log_map(systematic1,yparity1,m11,m10,alpha,beta,ext,n2,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
__m128i* ext_128=(__m128i*) ext;
__m128i* s1_128=(__m128i*) systematic1;
__m128i* s0_128=(__m128i*) systematic0;
__m128i *ext_128=(__m128i *) ext;
__m128i *s1_128=(__m128i *) systematic1;
__m128i *s0_128=(__m128i *) systematic0;
#ifndef LLR8
int myloop=n2>>3;
......@@ -2601,27 +2411,21 @@ unsigned char phy_threegpplte_turbo_decoder(short *y,
#ifdef TEST_DEBUG
int test_logmap8()
{
int test_logmap8() {
unsigned char test[8];
//_declspec(align(16)) char channel_output[512];
//_declspec(align(16)) unsigned char output[512],decoded_output[16], *inPtr, *outPtr;
short channel_output[512];
unsigned char output[512],decoded_output[16];
unsigned int i,crc,ret;
test[0] = 7;
test[1] = 0xa5;
test[2] = 0x11;
test[3] = 0x92;
test[4] = 0xfe;
crc = crc24a(test,
40)>>8;
*(unsigned int*)(&test[5]) = crc;
*(unsigned int *)(&test[5]) = crc;
printf("crc24 = %x\n",crc);
threegpplte_turbo_encoder(test, //input
8, //input length bytes
......@@ -2646,20 +2450,15 @@ int test_logmap8()
0, // filler bits
0); // decoder instance
for (i=0; i<8; i++)
printf("output %d => %x (input %x)\n",i,decoded_output[i],test[i]);
printf("output %u => %x (input %x)\n",i,decoded_output[i],test[i]);
}
int main()
{
int main() {
test_logmap8();
return(0);
}
......
......@@ -41,53 +41,53 @@
#include "PHY/sse_intrin.h"
#ifndef TEST_DEBUG
#include "PHY/impl_defs_top.h"
#include "PHY/defs_common.h"
#include "PHY/CODING/coding_defs.h"
#include "PHY/CODING/lte_interleaver_inline.h"
#include "PHY/impl_defs_top.h"
#include "PHY/defs_common.h"
#include "PHY/CODING/coding_defs.h"
#include "PHY/CODING/lte_interleaver_inline.h"
#else
#include "defs.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "defs.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#endif
#ifdef MEX
#include "mex.h"
#include "mex.h"
#endif
//#define DEBUG_LOGMAP
#ifdef DEBUG_LOGMAP
#define print_shorts(s,x) fprintf(fdsse4,"%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7])
#define print_shorts(s,x) fprintf(fdsse4,"%s %d,%d,%d,%d,%d,%d,%d,%d\n",s,(x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5],(x)[6],(x)[7])
#endif
#undef __AVX2__
#ifdef DEBUG_LOGMAP
FILE *fdsse4;
FILE *fdsse4;
#endif
typedef int16_t llr_t; // internal decoder LLR data is 16-bit fixed
typedef int16_t channel_t;
#define MAX 256
void log_map16(llr_t* systematic,channel_t* y_parity, llr_t* m11, llr_t* m10, llr_t *alpha, llr_t *beta, llr_t* ext,unsigned short frame_length,unsigned char term_flag,unsigned char F,
void log_map16(llr_t *systematic,channel_t *y_parity, llr_t *m11, llr_t *m10, llr_t *alpha, llr_t *beta, llr_t *ext,unsigned short frame_length,unsigned char term_flag,unsigned char F,
int offset8_flag,time_stats_t *alpha_stats,time_stats_t *beta_stats,time_stats_t *gamma_stats,time_stats_t *ext_stats);
void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic, channel_t* y_parity, unsigned short frame_length,unsigned char term_flag);
void compute_alpha16(llr_t*alpha,llr_t *beta, llr_t* m11,llr_t* m10, unsigned short frame_length,unsigned char F);
void compute_beta16(llr_t*alpha, llr_t* beta,llr_t* m11,llr_t* m10, unsigned short frame_length,unsigned char F,int offset8_flag);
void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m11,llr_t* m10,llr_t* extrinsic, llr_t* ap, unsigned short frame_length);
void compute_gamma16(llr_t *m11,llr_t *m10,llr_t *systematic, channel_t *y_parity, unsigned short frame_length,unsigned char term_flag);
void compute_alpha16(llr_t *alpha,llr_t *beta, llr_t *m11,llr_t *m10, unsigned short frame_length,unsigned char F);
void compute_beta16(llr_t *alpha, llr_t *beta,llr_t *m11,llr_t *m10, unsigned short frame_length,unsigned char F,int offset8_flag);
void compute_ext16(llr_t *alpha,llr_t *beta,llr_t *m11,llr_t *m10,llr_t *extrinsic, llr_t *ap, unsigned short frame_length);
void log_map16(llr_t* systematic,
channel_t* y_parity,
llr_t* m11,
llr_t* m10,
void log_map16(llr_t *systematic,
channel_t *y_parity,
llr_t *m11,
llr_t *m10,
llr_t *alpha,
llr_t *beta,
llr_t* ext,
llr_t *ext,
unsigned short frame_length,
unsigned char term_flag,
unsigned char F,
......@@ -95,13 +95,10 @@ void log_map16(llr_t* systematic,
time_stats_t *alpha_stats,
time_stats_t *beta_stats,
time_stats_t *gamma_stats,
time_stats_t *ext_stats)
{
time_stats_t *ext_stats) {
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"log_map, frame_length %d\n",frame_length);
#endif
start_meas(gamma_stats) ;
compute_gamma16(m11,m10,systematic,y_parity,frame_length,term_flag) ;
stop_meas(gamma_stats);
......@@ -114,13 +111,10 @@ void log_map16(llr_t* systematic,
start_meas(ext_stats) ;
compute_ext16(alpha,beta,m11,m10,ext,systematic,frame_length) ;
stop_meas(ext_stats);
}
void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
unsigned short frame_length,unsigned char term_flag)
{
void compute_gamma16(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity,
unsigned short frame_length,unsigned char term_flag) {
int k,K1;
#if defined(__x86_64__)||defined(__i386__)
__m128i *systematic128 = (__m128i *)systematic;
......@@ -133,18 +127,18 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
int16x8_t *m10_128 = (int16x8_t *)m10;
int16x8_t *m11_128 = (int16x8_t *)m11;
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"compute_gamma (sse_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
#endif
#ifndef __AVX2__
K1=frame_length>>3;
#else
if ((frame_length&15) > 0)
K1=(frame_length+1)>>4;
else
K1=frame_length>>4;
#endif
for (k=0; k<K1; k++) {
......@@ -153,21 +147,20 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k],y_parity128[k]),1);
m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k],y_parity128[k]),1);
#else
((__m256i*)m11_128)[k] = _mm256_srai_epi16(_mm256_adds_epi16(((__m256i*)systematic128)[k],((__m256i*)y_parity128)[k]),1);
((__m256i *)m11_128)[k] = _mm256_srai_epi16(_mm256_adds_epi16(((__m256i *)systematic128)[k],((__m256i *)y_parity128)[k]),1);
// ((__m256i*)m10_128)[k] = _mm256_srai_epi16(_mm256_subs_epi16(((__m256i*)y_parity128)[k],((__m256i*)systematic128)[k]),1);
((__m256i*)m10_128)[k] = _mm256_srai_epi16(_mm256_subs_epi16(((__m256i*)systematic128)[k],((__m256i*)y_parity128)[k]),1);
((__m256i *)m10_128)[k] = _mm256_srai_epi16(_mm256_subs_epi16(((__m256i *)systematic128)[k],((__m256i *)y_parity128)[k]),1);
#endif
#elif defined(__arm__)
m11_128[k] = vhaddq_s16(systematic128[k],y_parity128[k]);
m10_128[k] = vhsubq_s16(systematic128[k],y_parity128[k]);
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"Loop index k %d\n", k);
print_shorts("sys",(int16_t*)&systematic128[k]);
print_shorts("yp",(int16_t*)&y_parity128[k]);
print_shorts("m11",(int16_t*)&m11_128[k]);
print_shorts("m10",(int16_t*)&m10_128[k]);
print_shorts("sys",(int16_t *)&systematic128[k]);
print_shorts("yp",(int16_t *)&y_parity128[k]);
print_shorts("m11",(int16_t *)&m11_128[k]);
print_shorts("m10",(int16_t *)&m10_128[k]);
#endif
}
......@@ -185,20 +178,18 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
m11_128[k] = vhaddq_s16(systematic128[k+term_flag],y_parity128[k]);
m10_128[k] = vhsubq_s16(systematic128[k+term_flag],y_parity128[k]);
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"Loop index k %d (term flag %d)\n", k,term_flag);
print_shorts("sys",(int16_t*)&systematic128[k]);
print_shorts("yp",(int16_t*)&y_parity128[k]);
print_shorts("m11",(int16_t*)&m11_128[k]);
print_shorts("m10",(int16_t*)&m10_128[k]);
fprintf(fdsse4,"Loop index k %d (term flag %d)\n", k,term_flag);
print_shorts("sys",(int16_t *)&systematic128[k]);
print_shorts("yp",(int16_t *)&y_parity128[k]);
print_shorts("m11",(int16_t *)&m11_128[k]);
print_shorts("m10",(int16_t *)&m10_128[k]);
#endif
}
#define L 40
void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned short frame_length,unsigned char F)
{
void compute_alpha16(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned short frame_length,unsigned char F) {
int k,l,l2,K1,rerun_flag=0;
#if defined(__x86_64__) || defined(__i386__)
__m128i *alpha128=(__m128i *)alpha,*alpha_ptr,*m11p,*m10p;
......@@ -215,7 +206,6 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
__m256i m11m10_256;
__m256i alpha_max;
#endif
#elif defined(__arm__)
int16x8_t *alpha128=(int16x8_t *)alpha,*alpha_ptr;
int16x8_t a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p;
......@@ -228,6 +218,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"compute_alpha (sse_16bit)\n");
#endif
for (l=K1;; l=l2,rerun_flag=1) {
#if defined(__x86_64__) || defined(__i386__)
alpha128 = (__m128i *)alpha;
......@@ -259,14 +250,14 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"Initial alpha\n");
print_shorts("a0",(int16_t*)&alpha128[0]);
print_shorts("a1",(int16_t*)&alpha128[1]);
print_shorts("a2",(int16_t*)&alpha128[2]);
print_shorts("a3",(int16_t*)&alpha128[3]);
print_shorts("a4",(int16_t*)&alpha128[4]);
print_shorts("a5",(int16_t*)&alpha128[5]);
print_shorts("a6",(int16_t*)&alpha128[6]);
print_shorts("a7",(int16_t*)&alpha128[7]);
print_shorts("a0",(int16_t *)&alpha128[0]);
print_shorts("a1",(int16_t *)&alpha128[1]);
print_shorts("a2",(int16_t *)&alpha128[2]);
print_shorts("a3",(int16_t *)&alpha128[3]);
print_shorts("a4",(int16_t *)&alpha128[4]);
print_shorts("a5",(int16_t *)&alpha128[5]);
print_shorts("a6",(int16_t *)&alpha128[6]);
print_shorts("a7",(int16_t *)&alpha128[7]);
#endif
} else {
//set initial alpha in columns 1-7 from final alpha from last run in columns 0-6
......@@ -280,14 +271,22 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha128[6] = _mm_slli_si128(alpha128[6+frame_length],2);
alpha128[7] = _mm_slli_si128(alpha128[7+frame_length],2);
#elif defined(__arm__)
alpha128[0] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[frame_length],16); alpha128[0] = vsetq_lane_s16(alpha[8],alpha128[0],3);
alpha128[1] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[1+frame_length],16); alpha128[1] = vsetq_lane_s16(alpha[24],alpha128[0],3);
alpha128[2] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[2+frame_length],16); alpha128[2] = vsetq_lane_s16(alpha[40],alpha128[0],3);
alpha128[3] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[3+frame_length],16); alpha128[3] = vsetq_lane_s16(alpha[56],alpha128[0],3);
alpha128[4] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[4+frame_length],16); alpha128[4] = vsetq_lane_s16(alpha[72],alpha128[0],3);
alpha128[5] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[5+frame_length],16); alpha128[5] = vsetq_lane_s16(alpha[88],alpha128[0],3);
alpha128[6] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[6+frame_length],16); alpha128[6] = vsetq_lane_s16(alpha[104],alpha128[0],3);
alpha128[7] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[7+frame_length],16); alpha128[7] = vsetq_lane_s16(alpha[120],alpha128[0],3);
alpha128[0] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[frame_length],16);
alpha128[0] = vsetq_lane_s16(alpha[8],alpha128[0],3);
alpha128[1] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[1+frame_length],16);
alpha128[1] = vsetq_lane_s16(alpha[24],alpha128[0],3);
alpha128[2] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[2+frame_length],16);
alpha128[2] = vsetq_lane_s16(alpha[40],alpha128[0],3);
alpha128[3] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[3+frame_length],16);
alpha128[3] = vsetq_lane_s16(alpha[56],alpha128[0],3);
alpha128[4] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[4+frame_length],16);
alpha128[4] = vsetq_lane_s16(alpha[72],alpha128[0],3);
alpha128[5] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[5+frame_length],16);
alpha128[5] = vsetq_lane_s16(alpha[88],alpha128[0],3);
alpha128[6] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[6+frame_length],16);
alpha128[6] = vsetq_lane_s16(alpha[104],alpha128[0],3);
alpha128[7] = (int16x8_t)vshlq_n_s64((int64x2_t)alpha128[7+frame_length],16);
alpha128[7] = vsetq_lane_s16(alpha[120],alpha128[0],3);
#endif
// set initial alpha in column 0 to (0,-MAX/2,...,-MAX/2)
alpha[8] = -MAX/2;
......@@ -299,31 +298,30 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha[56] = -MAX/2;
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"Second run\n");
print_shorts("a0",(int16_t*)&alpha128[0]);
print_shorts("a1",(int16_t*)&alpha128[1]);
print_shorts("a2",(int16_t*)&alpha128[2]);
print_shorts("a3",(int16_t*)&alpha128[3]);
print_shorts("a4",(int16_t*)&alpha128[4]);
print_shorts("a5",(int16_t*)&alpha128[5]);
print_shorts("a6",(int16_t*)&alpha128[6]);
print_shorts("a7",(int16_t*)&alpha128[7]);
print_shorts("a0",(int16_t *)&alpha128[0]);
print_shorts("a1",(int16_t *)&alpha128[1]);
print_shorts("a2",(int16_t *)&alpha128[2]);
print_shorts("a3",(int16_t *)&alpha128[3]);
print_shorts("a4",(int16_t *)&alpha128[4]);
print_shorts("a5",(int16_t *)&alpha128[5]);
print_shorts("a6",(int16_t *)&alpha128[6]);
print_shorts("a7",(int16_t *)&alpha128[7]);
#endif
}
alpha_ptr = &alpha128[0];
//#ifdef __AVX2__
#if defined(__x86_64__) || defined(__i386__)
m11p = (__m128i*)m_11;
m10p = (__m128i*)m_10;
m11p = (__m128i *)m_11;
m10p = (__m128i *)m_10;
#elif defined(__arm__)
m11p = (int16x8_t*)m_11;
m10p = (int16x8_t*)m_10;
m11p = (int16x8_t *)m_11;
m10p = (int16x8_t *)m_10;
#endif
for (k=0;
k<l;
k++) {
#if defined(__x86_64__) || defined(__i386__)
//#ifndef __AVX2__
#if 1
......@@ -331,7 +329,6 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
a3=_mm_load_si128(&alpha_ptr[3]);
a5=_mm_load_si128(&alpha_ptr[5]);
a7=_mm_load_si128(&alpha_ptr[7]);
m_b0 = _mm_adds_epi16(a1,*m11p); // m11
m_b4 = _mm_subs_epi16(a1,*m11p); // m00=-m11
m_b1 = _mm_subs_epi16(a3,*m10p); // m01=-m10
......@@ -340,12 +337,10 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
m_b6 = _mm_subs_epi16(a5,*m10p); // m01=-m10
m_b3 = _mm_subs_epi16(a7,*m11p); // m00=-m11
m_b7 = _mm_adds_epi16(a7,*m11p); // m11
a0=_mm_load_si128(&alpha_ptr[0]);
a2=_mm_load_si128(&alpha_ptr[2]);
a4=_mm_load_si128(&alpha_ptr[4]);
a6=_mm_load_si128(&alpha_ptr[6]);
new0 = _mm_subs_epi16(a0,*m11p); // m00=-m11
new4 = _mm_adds_epi16(a0,*m11p); // m11
new1 = _mm_adds_epi16(a2,*m10p); // m10
......@@ -354,7 +349,6 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
new6 = _mm_adds_epi16(a4,*m10p); // m10
new3 = _mm_adds_epi16(a6,*m11p); // m11
new7 = _mm_subs_epi16(a6,*m11p); // m00=-m11
a0 = _mm_max_epi16(m_b0,new0);
a1 = _mm_max_epi16(m_b1,new1);
a2 = _mm_max_epi16(m_b2,new2);
......@@ -363,7 +357,6 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
a5 = _mm_max_epi16(m_b5,new5);
a6 = _mm_max_epi16(m_b6,new6);
a7 = _mm_max_epi16(m_b7,new7);
alpha_max = _mm_max_epi16(a0,a1);
alpha_max = _mm_max_epi16(alpha_max,a2);
alpha_max = _mm_max_epi16(alpha_max,a3);
......@@ -378,29 +371,22 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
a75=_mm256_load_si256(&alpha_ptr256[3]);
m11m10_256 = _mm256_insertf128_si256(m11m10_256,*m11p,0);
m11m10_256 = _mm256_insertf128_si256(m11m10_256,*m10p,1);
m_b01 = _mm256_adds_epi16(a13,m11m10_256); //negative m10
m_b23 = _mm256_subs_epi16(a75,m11m10_256); //negative m10
m_b45 = _mm256_subs_epi16(a13,m11m10_256); //negative m10
m_b67 = _mm256_adds_epi16(a75,m11m10_256); //negative m10
new01 = _mm256_subs_epi16(a02,m11m10_256); //negative m10
new23 = _mm256_adds_epi16(a64,m11m10_256); //negative m10
new45 = _mm256_adds_epi16(a02,m11m10_256); //negative m10
new67 = _mm256_subs_epi16(a64,m11m10_256); //negative m10
a01 = _mm256_max_epi16(m_b01,new01);
a23 = _mm256_max_epi16(m_b23,new23);
a45 = _mm256_max_epi16(m_b45,new45);
a67 = _mm256_max_epi16(m_b67,new67);
alpha_max = _mm256_max_epi16(a01,a23);
alpha_max = _mm256_max_epi16(alpha_max,a45);
alpha_max = _mm256_max_epi16(alpha_max,a67);
alpha_max = _mm256_max_epi16(alpha_max,_mm256_permutevar8x32_epi32(alpha_max,_mm256_set_epi32(3,2,1,0,7,6,5,4)));
#endif
#elif defined(__arm__)
m_b0 = vqaddq_s16(alpha_ptr[1],*m11p); // m11
......@@ -411,7 +397,6 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
m_b6 = vqsubq_s16(alpha_ptr[5],*m10p); // m01=-m10
m_b3 = vqsubq_s16(alpha_ptr[7],*m11p); // m00=-m11
m_b7 = vqaddq_s16(alpha_ptr[7],*m11p); // m11
new0 = vqsubq_s16(alpha_ptr[0],*m11p); // m00=-m11
new4 = vqaddq_s16(alpha_ptr[0],*m11p); // m11
new1 = vqaddq_s16(alpha_ptr[2],*m10p); // m10
......@@ -428,7 +413,6 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
a5 = vmaxq_s16(m_b5,new5);
a6 = vmaxq_s16(m_b6,new6);
a7 = vmaxq_s16(m_b7,new7);
// compute and subtract maxima
alpha_max = vmaxq_s16(a0,a1);
alpha_max = vmaxq_s16(alpha_max,a2);
......@@ -437,9 +421,7 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha_max = vmaxq_s16(alpha_max,a5);
alpha_max = vmaxq_s16(alpha_max,a6);
alpha_max = vmaxq_s16(alpha_max,a7);
#endif
alpha_ptr+=8;
//#ifdef __AVX2__
m11p++;
......@@ -456,12 +438,10 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha_ptr[6] = _mm_subs_epi16(a6,alpha_max);
alpha_ptr[7] = _mm_subs_epi16(a7,alpha_max);
#else
a01 = _mm256_subs_epi16(a01,alpha_max);
a23 = _mm256_subs_epi16(a23,alpha_max);
a45 = _mm256_subs_epi16(a45,alpha_max);
a67 = _mm256_subs_epi16(a67,alpha_max);
alpha_ptr256[0] = _mm256_permute2x128_si256(a01,a23,0x20); //a02
alpha_ptr256[1] = _mm256_permute2x128_si256(a01,a23,0x13); //a13
alpha_ptr256[2] = _mm256_permute2x128_si256(a45,a67,0x02); //a64
......@@ -477,49 +457,44 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha_ptr[6] = vqsubq_s16(a6,alpha_max);
alpha_ptr[7] = vqsubq_s16(a7,alpha_max);
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"Loop index %d\n",k);
print_shorts("mb0",(int16_t*)&m_b0);
print_shorts("mb1",(int16_t*)&m_b1);
print_shorts("mb2",(int16_t*)&m_b2);
print_shorts("mb3",(int16_t*)&m_b3);
print_shorts("mb4",(int16_t*)&m_b4);
print_shorts("mb5",(int16_t*)&m_b5);
print_shorts("mb6",(int16_t*)&m_b6);
print_shorts("mb7",(int16_t*)&m_b7);
print_shorts("mb0",(int16_t *)&m_b0);
print_shorts("mb1",(int16_t *)&m_b1);
print_shorts("mb2",(int16_t *)&m_b2);
print_shorts("mb3",(int16_t *)&m_b3);
print_shorts("mb4",(int16_t *)&m_b4);
print_shorts("mb5",(int16_t *)&m_b5);
print_shorts("mb6",(int16_t *)&m_b6);
print_shorts("mb7",(int16_t *)&m_b7);
fprintf(fdsse4,"Loop index %d, new\n",k);
print_shorts("new0",(int16_t*)&new0);
print_shorts("new1",(int16_t*)&new1);
print_shorts("new2",(int16_t*)&new2);
print_shorts("new3",(int16_t*)&new3);
print_shorts("new4",(int16_t*)&new4);
print_shorts("new5",(int16_t*)&new5);
print_shorts("new6",(int16_t*)&new6);
print_shorts("new7",(int16_t*)&new7);
print_shorts("new0",(int16_t *)&new0);
print_shorts("new1",(int16_t *)&new1);
print_shorts("new2",(int16_t *)&new2);
print_shorts("new3",(int16_t *)&new3);
print_shorts("new4",(int16_t *)&new4);
print_shorts("new5",(int16_t *)&new5);
print_shorts("new6",(int16_t *)&new6);
print_shorts("new7",(int16_t *)&new7);
fprintf(fdsse4,"Loop index %d, after max\n",k);
print_shorts("a0",(int16_t*)&a0);
print_shorts("a1",(int16_t*)&a1);
print_shorts("a2",(int16_t*)&a2);
print_shorts("a3",(int16_t*)&a3);
print_shorts("a4",(int16_t*)&a4);
print_shorts("a5",(int16_t*)&a5);
print_shorts("a6",(int16_t*)&a6);
print_shorts("a7",(int16_t*)&a7);
print_shorts("a0",(int16_t *)&a0);
print_shorts("a1",(int16_t *)&a1);
print_shorts("a2",(int16_t *)&a2);
print_shorts("a3",(int16_t *)&a3);
print_shorts("a4",(int16_t *)&a4);
print_shorts("a5",(int16_t *)&a5);
print_shorts("a6",(int16_t *)&a6);
print_shorts("a7",(int16_t *)&a7);
fprintf(fdsse4,"Loop index %d\n",k);
print_shorts("a0",(int16_t*)&alpha_ptr[0]);
print_shorts("a1",(int16_t*)&alpha_ptr[1]);
print_shorts("a2",(int16_t*)&alpha_ptr[2]);
print_shorts("a3",(int16_t*)&alpha_ptr[3]);
print_shorts("a4",(int16_t*)&alpha_ptr[4]);
print_shorts("a5",(int16_t*)&alpha_ptr[5]);
print_shorts("a6",(int16_t*)&alpha_ptr[6]);
print_shorts("a7",(int16_t*)&alpha_ptr[7]);
print_shorts("a0",(int16_t *)&alpha_ptr[0]);
print_shorts("a1",(int16_t *)&alpha_ptr[1]);
print_shorts("a2",(int16_t *)&alpha_ptr[2]);
print_shorts("a3",(int16_t *)&alpha_ptr[3]);
print_shorts("a4",(int16_t *)&alpha_ptr[4]);
print_shorts("a5",(int16_t *)&alpha_ptr[5]);
print_shorts("a6",(int16_t *)&alpha_ptr[6]);
print_shorts("a7",(int16_t *)&alpha_ptr[7]);
#endif
}
if (rerun_flag==1)
......@@ -528,37 +503,28 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
}
void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned short frame_length,unsigned char F,int offset8_flag)
{
void compute_beta16(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned short frame_length,unsigned char F,int offset8_flag) {
int k,rerun_flag=0;
#if defined(__x86_64__) || defined(__i386__)
__m128i m11_128,m10_128;
__m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
__m128i new0,new1,new2,new3,new4,new5,new6,new7;
__m128i *beta128,*alpha128,*beta_ptr;
__m128i beta_max;
#elif defined(__arm__)
int16x8_t m11_128,m10_128;
int16x8_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
int16x8_t new0,new1,new2,new3,new4,new5,new6,new7;
int16x8_t *beta128,*alpha128,*beta_ptr;
int16x8_t beta_max;
#endif
int16_t m11,m10,beta0_16,beta1_16,beta2_16,beta3_16,beta4_16,beta5_16,beta6_16,beta7_16,beta0_2,beta1_2,beta2_2,beta3_2,beta_m;
llr_t beta0,beta1;
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"compute_beta, %p,%p,%p,%p,framelength %d,F %d\n",
beta,m_11,m_10,alpha,frame_length,F);
#endif
// termination for beta initialization
// fprintf(fdsse4,"beta init: offset8 %d\n",offset8_flag);
m11=(int16_t)m_11[2+frame_length];
//#ifndef __AVX2__
......@@ -570,16 +536,13 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"m11,m10 %d,%d\n",m11,m10);
#endif
beta0 = -m11;//M0T_TERM;
beta1 = m11;//M1T_TERM;
m11=(int16_t)m_11[1+frame_length];
m10=(int16_t)m_10[1+frame_length];
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"m11,m10 %d,%d\n",m11,m10);
#endif
beta0_2 = beta0-m11;//+M0T_TERM;
beta1_2 = beta0+m11;//+M1T_TERM;
beta2_2 = beta1+m10;//M2T_TERM;
......@@ -597,8 +560,6 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta5_16 = beta2_2+m10;//+M5T_TERM;
beta6_16 = beta3_2+m11;//+M6T_TERM;
beta7_16 = beta3_2-m11;//+M7T_TERM;
beta_m = (beta0_16>beta1_16) ? beta0_16 : beta1_16;
beta_m = (beta_m>beta2_16) ? beta_m : beta2_16;
beta_m = (beta_m>beta3_16) ? beta_m : beta3_16;
......@@ -606,8 +567,6 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_m = (beta_m>beta5_16) ? beta_m : beta5_16;
beta_m = (beta_m>beta6_16) ? beta_m : beta6_16;
beta_m = (beta_m>beta7_16) ? beta_m : beta7_16;
beta0_16=beta0_16-beta_m;
beta1_16=beta1_16-beta_m;
beta2_16=beta2_16-beta_m;
......@@ -619,12 +578,13 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
for (rerun_flag=0;; rerun_flag=1) {
#if defined(__x86_64__) || defined(__i386__)
beta_ptr = (__m128i*)&beta[frame_length<<3];
alpha128 = (__m128i*)&alpha[0];
beta_ptr = (__m128i *)&beta[frame_length<<3];
alpha128 = (__m128i *)&alpha[0];
#elif defined(__arm__)
beta_ptr = (int16x8_t*)&beta[frame_length<<3];
alpha128 = (int16x8_t*)&alpha[0];
beta_ptr = (int16x8_t *)&beta[frame_length<<3];
alpha128 = (int16x8_t *)&alpha[0];
#endif
if (rerun_flag == 0) {
beta_ptr[0] = alpha128[(frame_length)];
beta_ptr[1] = alpha128[1+(frame_length)];
......@@ -636,18 +596,18 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[7] = alpha128[7+(frame_length)];
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"beta init \n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
print_shorts("b0",(int16_t *)&beta_ptr[0]);
print_shorts("b1",(int16_t *)&beta_ptr[1]);
print_shorts("b2",(int16_t *)&beta_ptr[2]);
print_shorts("b3",(int16_t *)&beta_ptr[3]);
print_shorts("b4",(int16_t *)&beta_ptr[4]);
print_shorts("b5",(int16_t *)&beta_ptr[5]);
print_shorts("b6",(int16_t *)&beta_ptr[6]);
print_shorts("b7",(int16_t *)&beta_ptr[7]);
#endif
} else {
#if defined(__x86_64__) || defined(__i386__)
beta128 = (__m128i*)&beta[0];
beta128 = (__m128i *)&beta[0];
beta_ptr[0] = _mm_srli_si128(beta128[0],2);
beta_ptr[1] = _mm_srli_si128(beta128[1],2);
beta_ptr[2] = _mm_srli_si128(beta128[2],2);
......@@ -657,31 +617,38 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[6] = _mm_srli_si128(beta128[6],2);
beta_ptr[7] = _mm_srli_si128(beta128[7],2);
#elif defined(__arm__)
beta128 = (int16x8_t*)&beta[0];
beta_ptr = (int16x8_t*)&beta[frame_length<<3];
beta_ptr[0] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[0],16); beta_ptr[0] = vsetq_lane_s16(beta[3],beta_ptr[0],4);
beta_ptr[1] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[1],16); beta_ptr[1] = vsetq_lane_s16(beta[11],beta_ptr[1],4);
beta_ptr[2] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[2],16); beta_ptr[2] = vsetq_lane_s16(beta[19],beta_ptr[2],4);
beta_ptr[3] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[3],16); beta_ptr[3] = vsetq_lane_s16(beta[27],beta_ptr[3],4);
beta_ptr[4] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[4],16); beta_ptr[4] = vsetq_lane_s16(beta[35],beta_ptr[4],4);
beta_ptr[5] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[5],16); beta_ptr[5] = vsetq_lane_s16(beta[43],beta_ptr[5],4);
beta_ptr[6] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[6],16); beta_ptr[6] = vsetq_lane_s16(beta[51],beta_ptr[6],4);
beta_ptr[7] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[7],16); beta_ptr[7] = vsetq_lane_s16(beta[59],beta_ptr[7],4);
beta128 = (int16x8_t *)&beta[0];
beta_ptr = (int16x8_t *)&beta[frame_length<<3];
beta_ptr[0] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[0],16);
beta_ptr[0] = vsetq_lane_s16(beta[3],beta_ptr[0],4);
beta_ptr[1] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[1],16);
beta_ptr[1] = vsetq_lane_s16(beta[11],beta_ptr[1],4);
beta_ptr[2] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[2],16);
beta_ptr[2] = vsetq_lane_s16(beta[19],beta_ptr[2],4);
beta_ptr[3] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[3],16);
beta_ptr[3] = vsetq_lane_s16(beta[27],beta_ptr[3],4);
beta_ptr[4] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[4],16);
beta_ptr[4] = vsetq_lane_s16(beta[35],beta_ptr[4],4);
beta_ptr[5] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[5],16);
beta_ptr[5] = vsetq_lane_s16(beta[43],beta_ptr[5],4);
beta_ptr[6] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[6],16);
beta_ptr[6] = vsetq_lane_s16(beta[51],beta_ptr[6],4);
beta_ptr[7] = (int16x8_t)vshrq_n_s64((int64x2_t)beta128[7],16);
beta_ptr[7] = vsetq_lane_s16(beta[59],beta_ptr[7],4);
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"beta init (second run) \n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
print_shorts("b0",(int16_t *)&beta_ptr[0]);
print_shorts("b1",(int16_t *)&beta_ptr[1]);
print_shorts("b2",(int16_t *)&beta_ptr[2]);
print_shorts("b3",(int16_t *)&beta_ptr[3]);
print_shorts("b4",(int16_t *)&beta_ptr[4]);
print_shorts("b5",(int16_t *)&beta_ptr[5]);
print_shorts("b6",(int16_t *)&beta_ptr[6]);
print_shorts("b7",(int16_t *)&beta_ptr[7]);
#endif
}
#if defined(__x86_64__) || defined(__i386__)
beta_ptr[0] = _mm_insert_epi16(beta_ptr[0],beta0_16,7);
beta_ptr[1] = _mm_insert_epi16(beta_ptr[1],beta1_16,7);
......@@ -701,26 +668,23 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[6] = vsetq_lane_s16(beta6_16,beta_ptr[6],7);
beta_ptr[7] = vsetq_lane_s16(beta7_16,beta_ptr[7],7);
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"beta init (after insert) \n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
print_shorts("b0",(int16_t *)&beta_ptr[0]);
print_shorts("b1",(int16_t *)&beta_ptr[1]);
print_shorts("b2",(int16_t *)&beta_ptr[2]);
print_shorts("b3",(int16_t *)&beta_ptr[3]);
print_shorts("b4",(int16_t *)&beta_ptr[4]);
print_shorts("b5",(int16_t *)&beta_ptr[5]);
print_shorts("b6",(int16_t *)&beta_ptr[6]);
print_shorts("b7",(int16_t *)&beta_ptr[7]);
#endif
int loopval=((rerun_flag==0)?0:((frame_length-L)>>3));
for (k=(frame_length>>3)-1; k>=loopval; k--) {
#if defined(__x86_64__) || defined(__i386__)
m11_128=((__m128i*)m_11)[k];
m10_128=((__m128i*)m_10)[k];
m11_128=((__m128i *)m_11)[k];
m10_128=((__m128i *)m_10)[k];
//#ifndef __AVX2__
#if 1
m_b0 = _mm_adds_epi16(beta_ptr[4],m11_128); //m11
......@@ -731,8 +695,6 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
m_b5 = _mm_subs_epi16(beta_ptr[6],m10_128); //m01
m_b6 = _mm_subs_epi16(beta_ptr[7],m11_128); //m00
m_b7 = _mm_adds_epi16(beta_ptr[7],m11_128); //m11
new0 = _mm_subs_epi16(beta_ptr[0],m11_128); //m00
new1 = _mm_adds_epi16(beta_ptr[0],m11_128); //m11
new2 = _mm_adds_epi16(beta_ptr[1],m10_128); //m10
......@@ -741,16 +703,13 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
new5 = _mm_adds_epi16(beta_ptr[2],m10_128); //m10
new6 = _mm_adds_epi16(beta_ptr[3],m11_128); //m11
new7 = _mm_subs_epi16(beta_ptr[3],m11_128); //m00
#else
b01=_mm256_load_si256(&((_m256i*)beta_ptr)[0]);
b23=_mm256_load_si256(&((_m256i*)beta_ptr)[1]);
b45=_mm256_load_si256(&((_m256i*)beta_ptr)[2]);
b67=_mm256_load_si256(&((_m256i*)beta_ptr)[3]);
b01=_mm256_load_si256(&((_m256i *)beta_ptr)[0]);
b23=_mm256_load_si256(&((_m256i *)beta_ptr)[1]);
b45=_mm256_load_si256(&((_m256i *)beta_ptr)[2]);
b67=_mm256_load_si256(&((_m256i *)beta_ptr)[3]);
m11m10_256 = _mm256_insertf128_si256(m11m10_256,m11_128,0);
m11m10_256 = _mm256_insertf128_si256(m11m10_256,m10_128,1);
m_b02 = _mm256_adds_epi16(b45,m11m10_256); //negative m10
m_b13 = _mm256_subs_epi16(b45,m11m10_256); //negative m10
m_b64 = _mm256_subs_epi16(b67,m11m10_256); //negative m10
......@@ -760,9 +719,7 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
new64 = _mm256_adds_epi16(b23,m11m10_256); //negative m10
new75 = _mm256_subs_epi16(b24,m11m10_256); //negative m10
#endif
beta_ptr-=8;
//#ifndef __AVX2__
#if 1
beta_ptr[0] = _mm_max_epi16(m_b0,new0);
......@@ -773,7 +730,6 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[5] = _mm_max_epi16(m_b5,new5);
beta_ptr[6] = _mm_max_epi16(m_b6,new6);
beta_ptr[7] = _mm_max_epi16(m_b7,new7);
beta_max = _mm_max_epi16(beta_ptr[0],beta_ptr[1]);
beta_max = _mm_max_epi16(beta_max ,beta_ptr[2]);
beta_max = _mm_max_epi16(beta_max ,beta_ptr[3]);
......@@ -781,7 +737,6 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_max = _mm_max_epi16(beta_max ,beta_ptr[5]);
beta_max = _mm_max_epi16(beta_max ,beta_ptr[6]);
beta_max = _mm_max_epi16(beta_max ,beta_ptr[7]);
beta_ptr[0] = _mm_subs_epi16(beta_ptr[0],beta_max);
beta_ptr[1] = _mm_subs_epi16(beta_ptr[1],beta_max);
beta_ptr[2] = _mm_subs_epi16(beta_ptr[2],beta_max);
......@@ -795,26 +750,22 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
b13 = _mm256_max_epi16(m_b13,new13);
b64 = _mm256_max_epi16(m_b64,new64);
b75 = _mm256_max_epi16(m_b75,new75);
beta_max = _mm256_max_epi16(b02,b13);
beta_max = _mm256_max_epi16(beta_max,b64);
beta_max = _mm256_max_epi16(beta_max,b75);
beta_max = _mm256_max_epi16(beta_max,_mm256_permutevar8x32_epi32(betaa_max,_mm256_set_epi32(3,2,1,0,7,6,5,4)));
b02 = _mm256_subs_epi16(b02,beta_max);
b13 = _mm256_subs_epi16(b13,beta_max);
b64 = _mm256_subs_epi16(b64,beta_max);
b75 = _mm256_subs_epi16(b75,beta_max);
((_m256i*)beta_ptr)[0]) = _mm256_permute2x128_si256(b02,b13,0x02); //b01
((_m256i*)beta_ptr)[1]) = _mm256_permute2x128_si256(b02,b13,0x31); //b23
((_m256i*)beta_ptr)[2]) = _mm256_permute2x128_si256(b64,b75,0x13); //b45
((_m256i*)beta_ptr)[3]) = _mm256_permute2x128_si256(b64,b75,0x20); //b67
((_m256i *)beta_ptr)[0]) = _mm256_permute2x128_si256(b02,b13,0x02); //b01
((_m256i *)beta_ptr)[1]) = _mm256_permute2x128_si256(b02,b13,0x31); //b23
((_m256i *)beta_ptr)[2]) = _mm256_permute2x128_si256(b64,b75,0x13); //b45
((_m256i *)beta_ptr)[3]) = _mm256_permute2x128_si256(b64,b75,0x20); //b67
#endif
#elif defined(__arm__)
m11_128=((int16x8_t*)m_11)[k];
m10_128=((int16x8_t*)m_10)[k];
m11_128=((int16x8_t *)m_11)[k];
m10_128=((int16x8_t *)m_10)[k];
m_b0 = vqaddq_s16(beta_ptr[4],m11_128); //m11
m_b1 = vqsubq_s16(beta_ptr[4],m11_128); //m00
m_b2 = vqsubq_s16(beta_ptr[5],m10_128); //m01
......@@ -823,7 +774,6 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
m_b5 = vqsubq_s16(beta_ptr[6],m10_128); //m01
m_b6 = vqsubq_s16(beta_ptr[7],m11_128); //m00
m_b7 = vqaddq_s16(beta_ptr[7],m11_128); //m11
new0 = vqsubq_s16(beta_ptr[0],m11_128); //m00
new1 = vqaddq_s16(beta_ptr[0],m11_128); //m11
new2 = vqaddq_s16(beta_ptr[1],m10_128); //m10
......@@ -832,9 +782,7 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
new5 = vqaddq_s16(beta_ptr[2],m10_128); //m10
new6 = vqaddq_s16(beta_ptr[3],m11_128); //m11
new7 = vqsubq_s16(beta_ptr[3],m11_128); //m00
beta_ptr-=8;
beta_ptr[0] = vmaxq_s16(m_b0,new0);
beta_ptr[1] = vmaxq_s16(m_b1,new1);
beta_ptr[2] = vmaxq_s16(m_b2,new2);
......@@ -843,7 +791,6 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[5] = vmaxq_s16(m_b5,new5);
beta_ptr[6] = vmaxq_s16(m_b6,new6);
beta_ptr[7] = vmaxq_s16(m_b7,new7);
beta_max = vmaxq_s16(beta_ptr[0],beta_ptr[1]);
beta_max = vmaxq_s16(beta_max ,beta_ptr[2]);
beta_max = vmaxq_s16(beta_max ,beta_ptr[3]);
......@@ -851,7 +798,6 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_max = vmaxq_s16(beta_max ,beta_ptr[5]);
beta_max = vmaxq_s16(beta_max ,beta_ptr[6]);
beta_max = vmaxq_s16(beta_max ,beta_ptr[7]);
beta_ptr[0] = vqsubq_s16(beta_ptr[0],beta_max);
beta_ptr[1] = vqsubq_s16(beta_ptr[1],beta_max);
beta_ptr[2] = vqsubq_s16(beta_ptr[2],beta_max);
......@@ -861,20 +807,18 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr[6] = vqsubq_s16(beta_ptr[6],beta_max);
beta_ptr[7] = vqsubq_s16(beta_ptr[7],beta_max);
#endif
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"Loop index %d, mb\n",k);
fprintf(fdsse4,"beta init (after max)\n");
print_shorts("b0",(int16_t*)&beta_ptr[0]);
print_shorts("b1",(int16_t*)&beta_ptr[1]);
print_shorts("b2",(int16_t*)&beta_ptr[2]);
print_shorts("b3",(int16_t*)&beta_ptr[3]);
print_shorts("b4",(int16_t*)&beta_ptr[4]);
print_shorts("b5",(int16_t*)&beta_ptr[5]);
print_shorts("b6",(int16_t*)&beta_ptr[6]);
print_shorts("b7",(int16_t*)&beta_ptr[7]);
print_shorts("b0",(int16_t *)&beta_ptr[0]);
print_shorts("b1",(int16_t *)&beta_ptr[1]);
print_shorts("b2",(int16_t *)&beta_ptr[2]);
print_shorts("b3",(int16_t *)&beta_ptr[3]);
print_shorts("b4",(int16_t *)&beta_ptr[4]);
print_shorts("b5",(int16_t *)&beta_ptr[5]);
print_shorts("b6",(int16_t *)&beta_ptr[6]);
print_shorts("b7",(int16_t *)&beta_ptr[7]);
#endif
}
if (rerun_flag==1)
......@@ -882,8 +826,7 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
}
}
void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, llr_t* systematic,unsigned short frame_length)
{
void compute_ext16(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,llr_t *ext, llr_t *systematic,unsigned short frame_length) {
#if defined(__x86_64__) || defined(__i386__)
__m128i *alpha128=(__m128i *)alpha;
__m128i *beta128=(__m128i *)beta;
......@@ -903,28 +846,21 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
int16x8_t m10_1,m10_2,m10_3,m10_4;
int16x8_t m11_1,m11_2,m11_3,m11_4;
#endif
int k;
//
// LLR computation, 8 consequtive bits per loop
//
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"compute_ext (sse_16bit), %p, %p, %p, %p, %p, %p ,framelength %d\n",alpha,beta,m_11,m_10,ext,systematic,frame_length);
#endif
alpha_ptr = alpha128;
beta_ptr = &beta128[8];
for (k=0; k<(frame_length>>3); k++) {
#if defined(__x86_64__) || defined(__i386__)
m11_128 = (__m128i*)&m_11[k<<3];
m10_128 = (__m128i*)&m_10[k<<3];
ext_128 = (__m128i*)&ext[k<<3];
m11_128 = (__m128i *)&m_11[k<<3];
m10_128 = (__m128i *)&m_10[k<<3];
ext_128 = (__m128i *)&ext[k<<3];
/*
fprintf(fdsse4,"EXT %03d\n",k);
print_shorts("a0:",&alpha_ptr[0]);
......@@ -944,7 +880,6 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
print_shorts("b6:",&beta_ptr[6]);
print_shorts("b7:",&beta_ptr[7]);
*/
//#ifndef __AVX2__
#if 1
m00_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
......@@ -964,31 +899,23 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
m10_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
m01_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
#else
m00_1 = _mm_adds_epi16(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00;
m10_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
m11_1 = _mm_adds_epi16(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11;
m01_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
m11_2 = _mm_adds_epi16(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11;
m01_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01;
m00_2 = _mm_adds_epi16(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00;
m10_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
m11_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11;
m01_3 = _mm_adds_epi16(alpha_ptr[4],beta_ptr[2]); //ALPHA_BETA_3m01;
m00_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
m10_3 = _mm_adds_epi16(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10;
m00_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
m10_4 = _mm_adds_epi16(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10;
m11_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
m01_4 = _mm_adds_epi16(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01;
#endif
/*
print_shorts("m11_1:",&m11_1);
print_shorts("m11_2:",&m11_2);
......@@ -1019,36 +946,30 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
m11_1 = _mm_max_epi16(m11_1,m11_2);
m11_1 = _mm_max_epi16(m11_1,m11_3);
m11_1 = _mm_max_epi16(m11_1,m11_4);
// print_shorts("m11_1:",&m11_1);
m01_1 = _mm_subs_epi16(m01_1,*m10_128);
m00_1 = _mm_subs_epi16(m00_1,*m11_128);
m10_1 = _mm_adds_epi16(m10_1,*m10_128);
m11_1 = _mm_adds_epi16(m11_1,*m11_128);
// print_shorts("m10_1:",&m10_1);
// print_shorts("m11_1:",&m11_1);
m01_1 = _mm_max_epi16(m01_1,m00_1);
m10_1 = _mm_max_epi16(m10_1,m11_1);
// print_shorts("m01_1:",&m01_1);
// print_shorts("m10_1:",&m10_1);
*ext_128 = _mm_subs_epi16(m10_1,m01_1);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"ext %p\n",ext_128);
print_shorts("ext:",(int16_t*)ext_128);
print_shorts("m11:",(int16_t*)m11_128);
print_shorts("m10:",(int16_t*)m10_128);
print_shorts("m10_1:",(int16_t*)&m10_1);
print_shorts("m01_1:",(int16_t*)&m01_1);
print_shorts("ext:",(int16_t *)ext_128);
print_shorts("m11:",(int16_t *)m11_128);
print_shorts("m10:",(int16_t *)m10_128);
print_shorts("m10_1:",(int16_t *)&m10_1);
print_shorts("m01_1:",(int16_t *)&m01_1);
#endif
#elif defined(__arm__)
m11_128 = (int16x8_t*)&m_11[k<<3];
m10_128 = (int16x8_t*)&m_10[k<<3];
ext_128 = (int16x8_t*)&ext[k<<3];
m11_128 = (int16x8_t *)&m_11[k<<3];
m10_128 = (int16x8_t *)&m_10[k<<3];
ext_128 = (int16x8_t *)&ext[k<<3];
m00_4 = vqaddq_s16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
m11_4 = vqaddq_s16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
m00_3 = vqaddq_s16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
......@@ -1065,7 +986,6 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
m10_2 = vqaddq_s16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
m10_1 = vqaddq_s16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
m01_1 = vqaddq_s16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
m01_1 = vmaxq_s16(m01_1,m01_2);
m01_1 = vmaxq_s16(m01_1,m01_3);
m01_1 = vmaxq_s16(m01_1,m01_4);
......@@ -1078,18 +998,12 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
m11_1 = vmaxq_s16(m11_1,m11_2);
m11_1 = vmaxq_s16(m11_1,m11_3);
m11_1 = vmaxq_s16(m11_1,m11_4);
m01_1 = vqsubq_s16(m01_1,*m10_128);
m00_1 = vqsubq_s16(m00_1,*m11_128);
m10_1 = vqaddq_s16(m10_1,*m10_128);
m11_1 = vqaddq_s16(m11_1,*m11_128);
m01_1 = vmaxq_s16(m01_1,m00_1);
m10_1 = vmaxq_s16(m10_1,m11_1);
*ext_128 = vqsubq_s16(m10_1,m01_1);
#endif
alpha_ptr+=8;
......@@ -1102,8 +1016,7 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
//int pi2[n],pi3[n+8],pi5[n+8],pi4[n+8],pi6[n+8],
int *pi2tab16[188],*pi5tab16[188],*pi4tab16[188],*pi6tab16[188];
void free_td16(void)
{
void free_td16(void) {
int ind;
for (ind=0; ind<188; ind++) {
......@@ -1114,14 +1027,11 @@ void free_td16(void)
}
}
void init_td16(void)
{
void init_td16(void) {
int ind,i,i2,i3,j,n,pi,pi3;
short * base_interleaver;
short *base_interleaver;
for (ind=0; ind<188; ind++) {
n = f1f2mat[ind].nb_bits;
base_interleaver=il_tb+f1f2mat[ind].beg_index;
#ifdef MEX
......@@ -1141,10 +1051,8 @@ void init_td16(void)
j=i2;
for (i3=0; i3<(n>>3); i3++,i++,j+=8) {
// if (j>=n)
// j-=(n-1);
pi2tab16[ind][i] = j;
// fprintf(fdsse4,"pi2[%d] = %d\n",i,j);
}
......@@ -1157,7 +1065,6 @@ void init_td16(void)
pi5tab16[ind][pi3] = pi2tab16[ind][i];
pi6tab16[ind][pi] = pi2tab16[ind][i];
}
}
}
......@@ -1176,52 +1083,41 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
time_stats_t *ext_stats,
time_stats_t *intl1_stats,
time_stats_t *intl2_stats) {
/* y is a pointer to the input
decoded_bytes is a pointer to the decoded output
n is the size in bits of the coded block, with the tail */
llr_t systematic0[n+16] __attribute__ ((aligned(32)));
llr_t systematic1[n+16] __attribute__ ((aligned(32)));
llr_t systematic2[n+16] __attribute__ ((aligned(32)));
llr_t yparity1[n+16] __attribute__ ((aligned(32)));
llr_t yparity2[n+16] __attribute__ ((aligned(32)));
llr_t ext[n+128] __attribute__((aligned(32)));
llr_t ext2[n+128] __attribute__((aligned(32)));
llr_t alpha[(n+16)*8] __attribute__ ((aligned(32)));
llr_t beta[(n+16)*8] __attribute__ ((aligned(32)));
llr_t m11[n+32] __attribute__ ((aligned(32)));
llr_t m10[n+32] __attribute__ ((aligned(32)));
int *pi2_p,*pi4_p,*pi5_p,*pi6_p;
llr_t *s,*s1,*s2,*yp1,*yp2,*yp;
unsigned int i,j,iind;//,pi;
unsigned char iteration_cnt=0;
unsigned int crc,oldcrc,crc_len;
uint8_t temp;
#if defined(__x86_64__) || defined(__i386__)
__m128i *yp128;
__m128i tmp, zeros=_mm_setzero_si128();
__m128i tmpe;
#elif defined(__arm__)
int16x8_t *yp128;
// int16x8_t tmp128[(n+8)>>3];
// int16x8_t tmp128[(n+8)>>3];
int16x8_t tmp, zeros=vdupq_n_s16(0);
const uint16_t __attribute__ ((aligned (16))) _Powers[8]=
{ 1, 2, 4, 8, 16, 32, 64, 128};
uint16x8_t Powers= vld1q_u16(_Powers);
#endif
int offset8_flag=0;
#ifdef DEBUG_LOGMAP
fdsse4 = fopen("dump_sse4.txt","w");
printf("tc sse4_16 (y) %p\n",y);
#endif
......@@ -1230,12 +1126,8 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
return 255;
}
start_meas(init_stats);
for (iind=0; iind < 188 && f1f2mat[iind].nb_bits != n; iind++);
if ( iind == 188 ) {
......@@ -1262,32 +1154,23 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
}
#if defined(__x86_64__) || defined(__i386__)
yp128 = (__m128i*)y;
yp128 = (__m128i *)y;
#elif defined(__arm__)
yp128 = (int16x8_t*)y;
yp128 = (int16x8_t *)y;
#endif
s = systematic0;
s1 = systematic1;
s2 = systematic2;
yp1 = yparity1;
yp2 = yparity2;
for (i=0; i<n; i+=8) {
pi2_p = &pi2tab16[iind][i];
j=pi2_p[0];
#if defined(__x86_64__) || defined(__i386__)
tmpe = _mm_load_si128(yp128);
// fprintf(fdsse4,"yp128 %p\n",yp128);
// print_shorts("tmpe",(int16_t *)&tmpe);
s[j] = _mm_extract_epi16(tmpe,0);
yp1[j] = _mm_extract_epi16(tmpe,1);
yp2[j] = _mm_extract_epi16(tmpe,2);
......@@ -1295,7 +1178,6 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
fprintf(fdsse4,"init0: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[1];
s[j] = _mm_extract_epi16(tmpe,3);
yp1[j] = _mm_extract_epi16(tmpe,4);
yp2[j] = _mm_extract_epi16(tmpe,5);
......@@ -1303,7 +1185,6 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
fprintf(fdsse4,"init1: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[2];
s[j] = _mm_extract_epi16(tmpe,6);
yp1[j] = _mm_extract_epi16(tmpe,7);
tmpe = _mm_load_si128(&yp128[1]);
......@@ -1312,7 +1193,6 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
fprintf(fdsse4,"init2: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[3];
s[j] = _mm_extract_epi16(tmpe,1);
yp1[j] = _mm_extract_epi16(tmpe,2);
yp2[j] = _mm_extract_epi16(tmpe,3);
......@@ -1320,7 +1200,6 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
fprintf(fdsse4,"init3: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[4];
s[j] = _mm_extract_epi16(tmpe,4);
yp1[j] = _mm_extract_epi16(tmpe,5);
yp2[j] = _mm_extract_epi16(tmpe,6);
......@@ -1328,7 +1207,6 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
fprintf(fdsse4,"init4: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[5];
s[j] = _mm_extract_epi16(tmpe,7);
tmpe = _mm_load_si128(&yp128[2]);
yp1[j] = _mm_extract_epi16(tmpe,0);
......@@ -1336,9 +1214,7 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init5: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[6];
s[j] = _mm_extract_epi16(tmpe,2);
yp1[j] = _mm_extract_epi16(tmpe,3);
yp2[j] = _mm_extract_epi16(tmpe,4);
......@@ -1346,60 +1222,49 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
fprintf(fdsse4,"init6: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
j=pi2_p[7];
s[j] = _mm_extract_epi16(tmpe,5);
yp1[j] = _mm_extract_epi16(tmpe,6);
yp2[j] = _mm_extract_epi16(tmpe,7);
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"init7: j %d, s[j] %d yp1[j] %d yp2[j] %d\n",j,s[j],yp1[j],yp2[j]);
#endif
#elif defined(__arm__)
s[j] = vgetq_lane_s16(yp128[0],0);
yp1[j] = vgetq_lane_s16(yp128[0],1);
yp2[j] = vgetq_lane_s16(yp128[0],2);
j=pi2_p[1];
s[j] = vgetq_lane_s16(yp128[0],3);
yp1[j] = vgetq_lane_s16(yp128[0],4);
yp2[j] = vgetq_lane_s16(yp128[0],5);
j=pi2_p[2];
s[j] = vgetq_lane_s16(yp128[0],6);
yp1[j] = vgetq_lane_s16(yp128[0],7);
yp2[j] = vgetq_lane_s16(yp128[1],0);
j=pi2_p[3];
s[j] = vgetq_lane_s16(yp128[1],1);
yp1[j] = vgetq_lane_s16(yp128[1],2);
yp2[j] = vgetq_lane_s16(yp128[1],3);
j=pi2_p[4];
s[j] = vgetq_lane_s16(yp128[1],4);
yp1[j] = vgetq_lane_s16(yp128[1],5);
yp2[j] = vgetq_lane_s16(yp128[1],6);
j=pi2_p[5];
s[j] = vgetq_lane_s16(yp128[1],7);
yp1[j] = vgetq_lane_s16(yp128[2],0);
yp2[j] = vgetq_lane_s16(yp128[2],1);
j=pi2_p[6];
s[j] = vgetq_lane_s16(yp128[2],2);
yp1[j] = vgetq_lane_s16(yp128[2],3);
yp2[j] = vgetq_lane_s16(yp128[2],4);
j=pi2_p[7];
s[j] = vgetq_lane_s16(yp128[2],5);
yp1[j] = vgetq_lane_s16(yp128[2],6);
yp2[j] = vgetq_lane_s16(yp128[2],7);
#endif
yp128+=3;
}
yp=(llr_t*)yp128;
yp=(llr_t *)yp128;
// Termination
for (i=n; i<n+3; i++) {
......@@ -1410,7 +1275,7 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
yp1[i] = *yp;
yp++;
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"Term 1 (%d): %d %d\n",i,s[i],yp1[i]);
fprintf(fdsse4,"Term 1 (%u): %d %d\n",i,s[i],yp1[i]);
#endif //DEBUG_LOGMAP
}
......@@ -1422,32 +1287,25 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
yp2[i-8] = *yp;
yp++;
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"Term 2 (%d): %d %d\n",i-3,s[i],yp2[i-8]);
fprintf(fdsse4,"Term 2 (%u): %d %d\n",i-3,s[i],yp2[i-8]);
#endif //DEBUG_LOGMAP
}
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"\n");
#endif //DEBUG_LOGMAP
stop_meas(init_stats);
// do log_map from first parity bit
log_map16(systematic0,yparity1,m11,m10,alpha,beta,ext,n,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
while (iteration_cnt++ < max_iterations) {
#ifdef DEBUG_LOGMAP
fprintf(fdsse4,"\n*******************ITERATION %d (n %d), ext %p\n\n",iteration_cnt,n,ext);
#endif //DEBUG_LOGMAP
start_meas(intl1_stats);
pi4_p=pi4tab16[iind];
for (i=0; i<(n>>3); i++) { // steady-state portion
#if defined(__x86_64__) || defined(__i386__)
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],0);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],1);
......@@ -1457,30 +1315,24 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],5);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],6);
((__m128i *)systematic2)[i]=_mm_insert_epi16(((__m128i *)systematic2)[i],ext[*pi4_p++],7);
#elif defined(__arm__)
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],0);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],1);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],2);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],3);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],4);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],5);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],6);
((int16x8_t*)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t*)systematic2)[i],7);
((int16x8_t *)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t *)systematic2)[i],0);
((int16x8_t *)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t *)systematic2)[i],1);
((int16x8_t *)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t *)systematic2)[i],2);
((int16x8_t *)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t *)systematic2)[i],3);
((int16x8_t *)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t *)systematic2)[i],4);
((int16x8_t *)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t *)systematic2)[i],5);
((int16x8_t *)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t *)systematic2)[i],6);
((int16x8_t *)systematic2)[i]=vsetq_lane_s16(ext[*pi4_p++],((int16x8_t *)systematic2)[i],7);
#endif
#ifdef DEBUG_LOGMAP
print_shorts("syst2",(int16_t*)&((__m128i *)systematic2)[i]);
print_shorts("syst2",(int16_t *)&((__m128i *)systematic2)[i]);
#endif
}
stop_meas(intl1_stats);
// do log_map from second parity bit
log_map16(systematic2,yparity2,m11,m10,alpha,beta,ext2,n,1,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
pi5_p=pi5tab16[iind];
for (i=0; i<(n>>3); i++) {
......@@ -1493,7 +1345,7 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],5);
tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],6);
tmp=_mm_insert_epi16(tmp,ext2[*pi5_p++],7);
((__m128i *)systematic1)[i] = _mm_adds_epi16(_mm_subs_epi16(tmp,((__m128i*)ext)[i]),((__m128i *)systematic0)[i]);
((__m128i *)systematic1)[i] = _mm_adds_epi16(_mm_subs_epi16(tmp,((__m128i *)ext)[i]),((__m128i *)systematic0)[i]);
#elif defined(__arm__)
tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,0);
tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,1);
......@@ -1503,10 +1355,10 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,5);
tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,6);
tmp=vsetq_lane_s16(ext2[*pi5_p++],tmp,7);
((int16x8_t *)systematic1)[i] = vqaddq_s16(vqsubq_s16(tmp,((int16x8_t*)ext)[i]),((int16x8_t *)systematic0)[i]);
((int16x8_t *)systematic1)[i] = vqaddq_s16(vqsubq_s16(tmp,((int16x8_t *)ext)[i]),((int16x8_t *)systematic0)[i]);
#endif
#ifdef DEBUG_LOGMAP
print_shorts("syst1",(int16_t*)&((__m128i *)systematic1)[i]);
print_shorts("syst1",(int16_t *)&((__m128i *)systematic1)[i]);
#endif
}
......@@ -1516,16 +1368,16 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
for (i=0; i<(n>>3); i++) {
#if defined(__x86_64__) || defined(__i386__)
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],7);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],6);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],5);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],4);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],3);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],2);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],1);
tmp=_mm_insert_epi16(tmp, ((llr_t*)ext2)[*pi6_p++],0);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],7);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],6);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],5);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],4);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],3);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],2);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],1);
tmp=_mm_insert_epi16(tmp, ((llr_t *)ext2)[*pi6_p++],0);
#ifdef DEBUG_LOGMAP
print_shorts("tmp",(int16_t*)&tmp);
print_shorts("tmp",(int16_t *)&tmp);
#endif
tmp=_mm_cmpgt_epi8(_mm_packs_epi16(tmp,zeros),zeros);
decoded_bytes[i]=(unsigned char)_mm_movemask_epi8(tmp);
......@@ -1538,18 +1390,18 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,2);
tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,1);
tmp=vsetq_lane_s16(ext2[*pi6_p++],tmp,0);
// This does:
// [1 2 4 8 16 32 64 128] .* I(ext_i > 0) = 2.^[b0 b1 b2 b3 b4 b5 b6 b7], where bi =I(ext_i > 0)
// [2^b0 + 2^b1 2^b2 + 2^b3 2^b4 + 2^b5 2^b6 + 2^b7]
// [2^b0 + 2^b1 + 2^b2 + 2^b3 2^b4 + 2^b5 + 2^b6 + 2^b7]
// Mask64 = 2^b0 + 2^b1 + 2^b2 + 2^b3 + 2^b4 + 2^b5 + 2^b6 + 2^b7
// This does:
// [1 2 4 8 16 32 64 128] .* I(ext_i > 0) = 2.^[b0 b1 b2 b3 b4 b5 b6 b7], where bi =I(ext_i > 0)
// [2^b0 + 2^b1 2^b2 + 2^b3 2^b4 + 2^b5 2^b6 + 2^b7]
// [2^b0 + 2^b1 + 2^b2 + 2^b3 2^b4 + 2^b5 + 2^b6 + 2^b7]
// Mask64 = 2^b0 + 2^b1 + 2^b2 + 2^b3 + 2^b4 + 2^b5 + 2^b6 + 2^b7
uint64x2_t Mask = vpaddlq_u32(vpaddlq_u16(vandq_u16(vcgtq_s16(tmp,zeros), Powers)));
uint64x1_t Mask64 = vget_high_u64(Mask)+vget_low_u64(Mask);
decoded_bytes[i] = (uint8_t)Mask64;
#endif
#ifdef DEBUG_LOGMAP
print_shorts("tmp",(int16_t*)&tmp);
fprintf(fdsse4,"decoded_bytes[%d] %x\n",i,decoded_bytes[i]);
print_shorts("tmp",(int16_t *)&tmp);
fprintf(fdsse4,"decoded_bytes[%u] %x\n",i,decoded_bytes[i]);
#endif
}
}
......@@ -1559,7 +1411,6 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
oldcrc= *((unsigned int *)(&decoded_bytes[(n>>3)-crc_len]));
switch (crc_type) {
case CRC24_A:
oldcrc&=0x00ffffff;
crc = crc24a(&decoded_bytes[F>>3],
......@@ -1610,13 +1461,13 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
if (iteration_cnt < max_iterations) {
log_map16(systematic1,yparity1,m11,m10,alpha,beta,ext,n,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
#if defined(__x86_64__) || defined(__i386__)
__m128i* ext_128=(__m128i*) ext;
__m128i* s1_128=(__m128i*) systematic1;
__m128i* s0_128=(__m128i*) systematic0;
__m128i *ext_128=(__m128i *) ext;
__m128i *s1_128=(__m128i *) systematic1;
__m128i *s0_128=(__m128i *) systematic0;
#elif defined(__arm__)
int16x8_t* ext_128=(int16x8_t*) ext;
int16x8_t* s1_128=(int16x8_t*) systematic1;
int16x8_t* s0_128=(int16x8_t*) systematic0;
int16x8_t *ext_128=(int16x8_t *) ext;
int16x8_t *s1_128=(int16x8_t *) systematic1;
int16x8_t *s0_128=(int16x8_t *) systematic0;
#endif
int myloop=n>>3;
......@@ -1632,11 +1483,9 @@ uint8_t phy_threegpplte_turbo_decoder16(int16_t *y,
}
// fprintf(fdsse4,"crc %x, oldcrc %x\n",crc,oldcrc);
#ifdef DEBUG_LOGMAP
fclose(fdsse4);
#endif
#if defined(__x86_64__) || defined(__i386__)
_mm_empty();
_m_empty();
......
......@@ -39,19 +39,19 @@
#include "PHY/sse_intrin.h"
#ifndef TEST_DEBUG
#include "PHY/defs_common.h"
#include "PHY/CODING/coding_defs.h"
#include "PHY/CODING/lte_interleaver_inline.h"
#include "PHY/defs_common.h"
#include "PHY/CODING/coding_defs.h"
#include "PHY/CODING/lte_interleaver_inline.h"
#else
#include "defs.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "defs.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#endif
#ifdef MEX
#include "mex.h"
#include "mex.h"
#endif
#include "common/ran_context.h"
......@@ -86,32 +86,28 @@ typedef int8_t channel_t;
#define MAX8 127
void log_map8(llr_t* systematic,channel_t* y_parity, llr_t* m11, llr_t* m10, llr_t *alpha, llr_t *beta, llr_t* ext,unsigned short frame_length,unsigned char term_flag,unsigned char F,int offset8_flag,
void log_map8(llr_t *systematic,channel_t *y_parity, llr_t *m11, llr_t *m10, llr_t *alpha, llr_t *beta, llr_t *ext,unsigned short frame_length,unsigned char term_flag,unsigned char F,int offset8_flag,
time_stats_t *alpha_stats,time_stats_t *beta_stats,time_stats_t *gamma_stats,time_stats_t *ext_stats);
void compute_gamma8(llr_t* m11,llr_t* m10,llr_t* systematic, channel_t* y_parity, unsigned short frame_length,unsigned char term_flag);
void compute_alpha8(llr_t*alpha,llr_t *beta, llr_t* m11,llr_t* m10, unsigned short frame_length,unsigned char F);
void compute_beta8(llr_t*alpha, llr_t* beta,llr_t* m11,llr_t* m10, unsigned short frame_length,unsigned char F,int offset8_flag);
void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m11,llr_t* m10,llr_t* extrinsic, llr_t* ap, unsigned short frame_length);
void print_bytes(char *s, int8_t *x)
{
void compute_gamma8(llr_t *m11,llr_t *m10,llr_t *systematic, channel_t *y_parity, unsigned short frame_length,unsigned char term_flag);
void compute_alpha8(llr_t *alpha,llr_t *beta, llr_t *m11,llr_t *m10, unsigned short frame_length,unsigned char F);
void compute_beta8(llr_t *alpha, llr_t *beta,llr_t *m11,llr_t *m10, unsigned short frame_length,unsigned char F,int offset8_flag);
void compute_ext8(llr_t *alpha,llr_t *beta,llr_t *m11,llr_t *m10,llr_t *extrinsic, llr_t *ap, unsigned short frame_length);
void print_bytes(char *s, int8_t *x) {
printf("%s : %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s,
x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7],
x[8],x[9],x[10],x[11],x[12],x[13],x[14],x[15]);
}
void log_map8(llr_t* systematic,
channel_t* y_parity,
llr_t* m11,
llr_t* m10,
void log_map8(llr_t *systematic,
channel_t *y_parity,
llr_t *m11,
llr_t *m10,
llr_t *alpha,
llr_t *beta,
llr_t* ext,
llr_t *ext,
unsigned short frame_length,
unsigned char term_flag,
unsigned char F,
......@@ -119,32 +115,38 @@ void log_map8(llr_t* systematic,
time_stats_t *alpha_stats,
time_stats_t *beta_stats,
time_stats_t *gamma_stats,
time_stats_t *ext_stats)
{
time_stats_t *ext_stats) {
#ifdef DEBUG_LOGMAP
printf("log_map, frame_length %d\n",frame_length);
#endif
if (gamma_stats) start_meas(gamma_stats) ;
compute_gamma8(m11,m10,systematic,y_parity,frame_length,term_flag) ;
if (gamma_stats) stop_meas(gamma_stats);
if (alpha_stats) start_meas(alpha_stats) ;
compute_alpha8(alpha,beta,m11,m10,frame_length,F) ;
if (alpha_stats) stop_meas(alpha_stats);
if (beta_stats) start_meas(beta_stats) ;
compute_beta8(alpha,beta,m11,m10,frame_length,F,offset8_flag) ;
if (beta_stats) stop_meas(beta_stats);
if (ext_stats) start_meas(ext_stats) ;
compute_ext8(alpha,beta,m11,m10,ext,systematic,frame_length) ;
if (ext_stats) stop_meas(ext_stats);
compute_ext8(alpha,beta,m11,m10,ext,systematic,frame_length) ;
if (ext_stats) stop_meas(ext_stats);
}
void compute_gamma8(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
unsigned short frame_length,unsigned char term_flag)
{
void compute_gamma8(llr_t *m11,llr_t *m10,llr_t *systematic,channel_t *y_parity,
unsigned short frame_length,unsigned char term_flag) {
int k,K1;
#if defined(__x86_64__)||defined(__i386__)
__m128i *systematic128 = (__m128i *)systematic;
......@@ -157,11 +159,9 @@ void compute_gamma8(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
int8x16_t *m10_128 = (int8x16_t *)m10;
int8x16_t *m11_128 = (int8x16_t *)m11;
#endif
#ifdef DEBUG_LOGMAP
printf("compute_gamma, %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
#endif
#if defined(__x86_64__) || defined(__i386__)
register __m128i sl,sh,ypl,yph; //K128=_mm_set1_epi8(-128);
#endif
......@@ -181,11 +181,9 @@ void compute_gamma8(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
m11_128[k] = vhaddq_s8(systematic128[k],y_parity128[k]);
m10_128[k] = vhsubq_s8(systematic128[k],y_parity128[k]);
#endif
}
// Termination
#if defined(__x86_64__) || defined(__i386__)
sl = _mm_cvtepi8_epi16(systematic128[k+term_flag]);
sh = _mm_cvtepi8_epi16(_mm_srli_si128(systematic128[k],8));
......@@ -199,15 +197,12 @@ void compute_gamma8(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
m11_128[k] = vhaddq_s8(systematic128[k+term_flag],y_parity128[k]);
m10_128[k] = vhsubq_s8(systematic128[k+term_flag],y_parity128[k]);
#endif
}
#define L 16
void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned short frame_length,unsigned char F)
{
void compute_alpha8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned short frame_length,unsigned char F) {
int k,loopval,rerun_flag;
#if defined(__x86_64__) || defined(__i386__)
__m128i *alpha128=(__m128i *)alpha,*alpha_ptr;
__m128i *m11p,*m10p;
......@@ -223,7 +218,6 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
#endif
// Set initial state: first colum is known
// the other columns are unknown, so all states are set to same value
#if defined(__x86_64__) || defined(__i386__)
alpha128[0] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,0);
alpha128[1] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
......@@ -233,12 +227,11 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
alpha128[5] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
alpha128[6] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
alpha128[7] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
for (loopval=frame_length>>4, rerun_flag=0; rerun_flag<2; loopval=L, rerun_flag++) {
for (loopval=frame_length>>4, rerun_flag=0; rerun_flag<2; loopval=L, rerun_flag++) {
alpha_ptr = &alpha128[0];
m11p = (__m128i*)m_11;
m10p = (__m128i*)m_10;
m11p = (__m128i *)m_11;
m10p = (__m128i *)m_10;
for (k=0; k<loopval; k++) {
m_b0 = _mm_adds_epi8(alpha_ptr[1],*m11p); // m11
......@@ -249,7 +242,6 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
m_b6 = _mm_subs_epi8(alpha_ptr[5],*m10p); // m01=-m10
m_b3 = _mm_subs_epi8(alpha_ptr[7],*m11p); // m00=-m11
m_b7 = _mm_adds_epi8(alpha_ptr[7],*m11p); // m11
new0 = _mm_subs_epi8(alpha_ptr[0],*m11p); // m00=-m11
new4 = _mm_adds_epi8(alpha_ptr[0],*m11p); // m11
new1 = _mm_adds_epi8(alpha_ptr[2],*m10p); // m10
......@@ -258,7 +250,6 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
new6 = _mm_adds_epi8(alpha_ptr[4],*m10p); // m10
new3 = _mm_adds_epi8(alpha_ptr[6],*m11p); // m11
new7 = _mm_subs_epi8(alpha_ptr[6],*m11p); // m00=-m11
alpha_ptr += 8;
m11p++;
m10p++;
......@@ -270,7 +261,6 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
alpha_ptr[5] = _mm_max_epi8(m_b5,new5);
alpha_ptr[6] = _mm_max_epi8(m_b6,new6);
alpha_ptr[7] = _mm_max_epi8(m_b7,new7);
// compute and subtract maxima
alpha_max = _mm_max_epi8(alpha_ptr[0],alpha_ptr[1]);
alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[2]);
......@@ -279,7 +269,6 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[5]);
alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[6]);
alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[7]);
alpha_ptr[0] = _mm_subs_epi8(alpha_ptr[0],alpha_max);
alpha_ptr[1] = _mm_subs_epi8(alpha_ptr[1],alpha_max);
alpha_ptr[2] = _mm_subs_epi8(alpha_ptr[2],alpha_max);
......@@ -308,8 +297,8 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
alpha[80] = -MAX8/2;
alpha[96] = -MAX8/2;
alpha[112] = -MAX8/2;
}
#elif defined(__arm__)
alpha128[0] = vdupq_n_s8(-MAX8/2);
alpha128[0] = vsetq_lane_s8(0,alpha128[0],0);
......@@ -320,12 +309,11 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
alpha128[5] = vdupq_n_s8(-MAX8/2);
alpha128[6] = vdupq_n_s8(-MAX8/2);
alpha128[7] = vdupq_n_s8(-MAX8/2);
for (loopval=frame_length>>4, rerun_flag=0; rerun_flag<2; loopval=L, rerun_flag++) {
for (loopval=frame_length>>4, rerun_flag=0; rerun_flag<2; loopval=L, rerun_flag++) {
alpha_ptr = &alpha128[0];
m11p = (int8x16_t*)m_11;
m10p = (int8x16_t*)m_10;
m11p = (int8x16_t *)m_11;
m10p = (int8x16_t *)m_10;
for (k=0; k<loopval; k++) {
m_b0 = vqaddq_s8(alpha_ptr[1],*m11p); // m11
......@@ -336,7 +324,6 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
m_b6 = vqsubq_s8(alpha_ptr[5],*m10p); // m01=-m10
m_b3 = vqsubq_s8(alpha_ptr[7],*m11p); // m00=-m11
m_b7 = vqaddq_s8(alpha_ptr[7],*m11p); // m11
new0 = vqsubq_s8(alpha_ptr[0],*m11p); // m00=-m11
new4 = vqaddq_s8(alpha_ptr[0],*m11p); // m11
new1 = vqaddq_s8(alpha_ptr[2],*m10p); // m10
......@@ -345,7 +332,6 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
new6 = vqaddq_s8(alpha_ptr[4],*m10p); // m10
new3 = vqaddq_s8(alpha_ptr[6],*m11p); // m11
new7 = vqsubq_s8(alpha_ptr[6],*m11p); // m00=-m11
alpha_ptr += 8;
m11p++;
m10p++;
......@@ -357,7 +343,6 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
alpha_ptr[5] = vmaxq_s8(m_b5,new5);
alpha_ptr[6] = vmaxq_s8(m_b6,new6);
alpha_ptr[7] = vmaxq_s8(m_b7,new7);
// compute and subtract maxima
alpha_max = vmaxq_s8(alpha_ptr[0],alpha_ptr[1]);
alpha_max = vmaxq_s8(alpha_max,alpha_ptr[2]);
......@@ -366,7 +351,6 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
alpha_max = vmaxq_s8(alpha_max,alpha_ptr[5]);
alpha_max = vmaxq_s8(alpha_max,alpha_ptr[6]);
alpha_max = vmaxq_s8(alpha_max,alpha_ptr[7]);
alpha_ptr[0] = vqsubq_s8(alpha_ptr[0],alpha_max);
alpha_ptr[1] = vqsubq_s8(alpha_ptr[1],alpha_max);
alpha_ptr[2] = vqsubq_s8(alpha_ptr[2],alpha_max);
......@@ -380,14 +364,22 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
// Set intial state for next iteration from the last state
// as a column end states are the first states of the next column
int K1= frame_length>>1;
alpha128[0] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[K1],8); alpha128[0] = vsetq_lane_s8(alpha[8],alpha128[0],7);
alpha128[1] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[1+K1],8); alpha128[1] = vsetq_lane_s8(alpha[24],alpha128[0],7);
alpha128[2] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[2+K1],8); alpha128[2] = vsetq_lane_s8(alpha[40],alpha128[0],7);
alpha128[3] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[3+K1],8); alpha128[3] = vsetq_lane_s8(alpha[56],alpha128[0],7);
alpha128[4] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[4+K1],8); alpha128[4] = vsetq_lane_s8(alpha[72],alpha128[0],7);
alpha128[5] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[5+K1],8); alpha128[5] = vsetq_lane_s8(alpha[88],alpha128[0],7);
alpha128[6] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[6+K1],8); alpha128[6] = vsetq_lane_s8(alpha[104],alpha128[0],7);
alpha128[7] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[7+K1],8); alpha128[7] = vsetq_lane_s8(alpha[120],alpha128[0],7);
alpha128[0] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[K1],8);
alpha128[0] = vsetq_lane_s8(alpha[8],alpha128[0],7);
alpha128[1] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[1+K1],8);
alpha128[1] = vsetq_lane_s8(alpha[24],alpha128[0],7);
alpha128[2] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[2+K1],8);
alpha128[2] = vsetq_lane_s8(alpha[40],alpha128[0],7);
alpha128[3] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[3+K1],8);
alpha128[3] = vsetq_lane_s8(alpha[56],alpha128[0],7);
alpha128[4] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[4+K1],8);
alpha128[4] = vsetq_lane_s8(alpha[72],alpha128[0],7);
alpha128[5] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[5+K1],8);
alpha128[5] = vsetq_lane_s8(alpha[88],alpha128[0],7);
alpha128[6] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[6+K1],8);
alpha128[6] = vsetq_lane_s8(alpha[104],alpha128[0],7);
alpha128[7] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[7+K1],8);
alpha128[7] = vsetq_lane_s8(alpha[120],alpha128[0],7);
alpha[16] = -MAX8/2;
alpha[32] = -MAX8/2;
alpha[48] = -MAX8/2;
......@@ -395,35 +387,28 @@ void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned sh
alpha[80] = -MAX8/2;
alpha[96] = -MAX8/2;
alpha[112] = -MAX8/2;
}
#endif
#endif
}
void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned short frame_length,unsigned char F,int offset8_flag)
{
void compute_beta8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,unsigned short frame_length,unsigned char F,int offset8_flag) {
int k,rerun_flag, loopval;
#if defined(__x86_64__) || defined(__i386__)
__m128i m11_128,m10_128;
__m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
__m128i new0,new1,new2,new3,new4,new5,new6,new7;
__m128i *beta128,*alpha128,*beta_ptr;
__m128i beta_max;
#elif defined(__arm__)
int8x16_t m11_128,m10_128;
int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
int8x16_t new0,new1,new2,new3,new4,new5,new6,new7;
int8x16_t *beta128,*alpha128,*beta_ptr;
int8x16_t beta_max;
#endif
llr_t beta0,beta1;
llr_t beta2,beta3,beta4,beta5,beta6,beta7;
if (frame_length > 6144) {
......@@ -433,13 +418,12 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
// we are supposed to run compute_alpha just before compute_beta
// so the initial states of backward computation can be set from last value of alpha states (forward computation)
#if defined(__x86_64__) || defined(__i386__)
beta_ptr = (__m128i*)&beta[frame_length<<3];
alpha128 = (__m128i*)&alpha[0];
beta_ptr = (__m128i *)&beta[frame_length<<3];
alpha128 = (__m128i *)&alpha[0];
#elif defined(__arm__)
beta_ptr = (int8x16_t*)&beta[frame_length<<3];
alpha128 = (int8x16_t*)&alpha[0];
beta_ptr = (int8x16_t *)&beta[frame_length<<3];
alpha128 = (int8x16_t *)&alpha[0];
#endif
beta_ptr[0] = alpha128[(frame_length>>1)];
beta_ptr[1] = alpha128[1+(frame_length>>1)];
......@@ -449,18 +433,15 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
beta_ptr[5] = alpha128[5+(frame_length>>1)];
beta_ptr[6] = alpha128[6+(frame_length>>1)];
beta_ptr[7] = alpha128[7+(frame_length>>1)];
int overlap = (frame_length>>4)> L ? (frame_length>>4)-L : 0 ;
for (rerun_flag=0, loopval=0;
rerun_flag<2 ;
loopval=overlap,rerun_flag++) {
if (offset8_flag==0) {
// FIXME! beta0-beta7 are used uninitialized. FIXME!
// workaround: init with 0
beta0 = beta1 = beta2 = beta3 = beta4 = beta5 = beta6 = beta7 = 0;
#if defined(__x86_64__) || defined(__i386__)
beta_ptr[0] = _mm_insert_epi8(beta_ptr[0],beta0,15);
beta_ptr[1] = _mm_insert_epi8(beta_ptr[1],beta1,15);
......@@ -483,16 +464,17 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
}
#if defined(__x86_64__) || defined(__i386__)
beta_ptr = (__m128i*)&beta[frame_length<<3];
beta_ptr = (__m128i *)&beta[frame_length<<3];
#elif defined(__arm__)
beta_ptr = (int8x16_t*)&beta[frame_length<<3];
beta_ptr = (int8x16_t *)&beta[frame_length<<3];
#endif
for (k=(frame_length>>4)-1;
k>=loopval;
k--) {
#if defined(__x86_64__) || defined(__i386__)
m11_128=((__m128i*)m_11)[k];
m10_128=((__m128i*)m_10)[k];
m11_128=((__m128i *)m_11)[k];
m10_128=((__m128i *)m_10)[k];
m_b0 = _mm_adds_epi8(beta_ptr[4],m11_128); //m11
m_b1 = _mm_subs_epi8(beta_ptr[4],m11_128); //m00
m_b2 = _mm_subs_epi8(beta_ptr[5],m10_128); //m01
......@@ -501,7 +483,6 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
m_b5 = _mm_subs_epi8(beta_ptr[6],m10_128); //m01
m_b6 = _mm_subs_epi8(beta_ptr[7],m11_128); //m00
m_b7 = _mm_adds_epi8(beta_ptr[7],m11_128); //m11
new0 = _mm_subs_epi8(beta_ptr[0],m11_128); //m00
new1 = _mm_adds_epi8(beta_ptr[0],m11_128); //m11
new2 = _mm_adds_epi8(beta_ptr[1],m10_128); //m10
......@@ -510,9 +491,7 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
new5 = _mm_adds_epi8(beta_ptr[2],m10_128); //m10
new6 = _mm_adds_epi8(beta_ptr[3],m11_128); //m11
new7 = _mm_subs_epi8(beta_ptr[3],m11_128); //m00
beta_ptr-=8;
beta_ptr[0] = _mm_max_epi8(m_b0,new0);
beta_ptr[1] = _mm_max_epi8(m_b1,new1);
beta_ptr[2] = _mm_max_epi8(m_b2,new2);
......@@ -521,7 +500,6 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
beta_ptr[5] = _mm_max_epi8(m_b5,new5);
beta_ptr[6] = _mm_max_epi8(m_b6,new6);
beta_ptr[7] = _mm_max_epi8(m_b7,new7);
beta_max = _mm_max_epi8(beta_ptr[0],beta_ptr[1]);
beta_max = _mm_max_epi8(beta_max ,beta_ptr[2]);
beta_max = _mm_max_epi8(beta_max ,beta_ptr[3]);
......@@ -529,7 +507,6 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
beta_max = _mm_max_epi8(beta_max ,beta_ptr[5]);
beta_max = _mm_max_epi8(beta_max ,beta_ptr[6]);
beta_max = _mm_max_epi8(beta_max ,beta_ptr[7]);
beta_ptr[0] = _mm_subs_epi8(beta_ptr[0],beta_max);
beta_ptr[1] = _mm_subs_epi8(beta_ptr[1],beta_max);
beta_ptr[2] = _mm_subs_epi8(beta_ptr[2],beta_max);
......@@ -539,8 +516,8 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
beta_ptr[6] = _mm_subs_epi8(beta_ptr[6],beta_max);
beta_ptr[7] = _mm_subs_epi8(beta_ptr[7],beta_max);
#elif defined(__arm__)
m11_128=((int8x16_t*)m_11)[k];
m10_128=((int8x16_t*)m_10)[k];
m11_128=((int8x16_t *)m_11)[k];
m10_128=((int8x16_t *)m_10)[k];
m_b0 = vqaddq_s8(beta_ptr[4],m11_128); //m11
m_b1 = vqsubq_s8(beta_ptr[4],m11_128); //m00
m_b2 = vqsubq_s8(beta_ptr[5],m10_128); //m01
......@@ -549,7 +526,6 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
m_b5 = vqsubq_s8(beta_ptr[6],m10_128); //m01
m_b6 = vqsubq_s8(beta_ptr[7],m11_128); //m00
m_b7 = vqaddq_s8(beta_ptr[7],m11_128); //m11
new0 = vqsubq_s8(beta_ptr[0],m11_128); //m00
new1 = vqaddq_s8(beta_ptr[0],m11_128); //m11
new2 = vqaddq_s8(beta_ptr[1],m10_128); //m10
......@@ -558,9 +534,7 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
new5 = vqaddq_s8(beta_ptr[2],m10_128); //m10
new6 = vqaddq_s8(beta_ptr[3],m11_128); //m11
new7 = vqsubq_s8(beta_ptr[3],m11_128); //m00
beta_ptr-=8;
beta_ptr[0] = vmaxq_s8(m_b0,new0);
beta_ptr[1] = vmaxq_s8(m_b1,new1);
beta_ptr[2] = vmaxq_s8(m_b2,new2);
......@@ -569,7 +543,6 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
beta_ptr[5] = vmaxq_s8(m_b5,new5);
beta_ptr[6] = vmaxq_s8(m_b6,new6);
beta_ptr[7] = vmaxq_s8(m_b7,new7);
beta_max = vmaxq_s8(beta_ptr[0],beta_ptr[1]);
beta_max = vmaxq_s8(beta_max ,beta_ptr[2]);
beta_max = vmaxq_s8(beta_max ,beta_ptr[3]);
......@@ -577,7 +550,6 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
beta_max = vmaxq_s8(beta_max ,beta_ptr[5]);
beta_max = vmaxq_s8(beta_max ,beta_ptr[6]);
beta_max = vmaxq_s8(beta_max ,beta_ptr[7]);
beta_ptr[0] = vqsubq_s8(beta_ptr[0],beta_max);
beta_ptr[1] = vqsubq_s8(beta_ptr[1],beta_max);
beta_ptr[2] = vqsubq_s8(beta_ptr[2],beta_max);
......@@ -592,10 +564,9 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
// Set intial state for next iteration from the last state
// as column last states are the first states of the next column
// The initial state of column 0 is coming from tail bits (to be computed)
#if defined(__x86_64__) || defined(__i386__)
beta128 = (__m128i*)&beta[0];
beta_ptr = (__m128i*)&beta[frame_length<<3];
beta128 = (__m128i *)&beta[0];
beta_ptr = (__m128i *)&beta[frame_length<<3];
beta_ptr[0] = _mm_srli_si128(beta128[0],1);
beta_ptr[1] = _mm_srli_si128(beta128[1],1);
beta_ptr[2] = _mm_srli_si128(beta128[2],1);
......@@ -605,23 +576,29 @@ void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sho
beta_ptr[6] = _mm_srli_si128(beta128[6],1);
beta_ptr[7] = _mm_srli_si128(beta128[7],1);
#elif defined(__arm__)
beta128 = (int8x16_t*)&beta[0];
beta_ptr = (int8x16_t*)&beta[frame_length<<3];
beta_ptr[0] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[0],8); beta_ptr[0] = vsetq_lane_s8(beta[7],beta_ptr[0],8);
beta_ptr[1] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[1],8); beta_ptr[1] = vsetq_lane_s8(beta[23],beta_ptr[1],8);
beta_ptr[2] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[2],8); beta_ptr[2] = vsetq_lane_s8(beta[39],beta_ptr[2],8);
beta_ptr[3] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[3],8); beta_ptr[3] = vsetq_lane_s8(beta[55],beta_ptr[3],8);
beta_ptr[4] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[4],8); beta_ptr[4] = vsetq_lane_s8(beta[71],beta_ptr[4],8);
beta_ptr[5] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[5],8); beta_ptr[5] = vsetq_lane_s8(beta[87],beta_ptr[5],8);
beta_ptr[6] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[6],8); beta_ptr[6] = vsetq_lane_s8(beta[103],beta_ptr[6],8);
beta_ptr[7] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[7],8); beta_ptr[7] = vsetq_lane_s8(beta[119],beta_ptr[7],8);
beta128 = (int8x16_t *)&beta[0];
beta_ptr = (int8x16_t *)&beta[frame_length<<3];
beta_ptr[0] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[0],8);
beta_ptr[0] = vsetq_lane_s8(beta[7],beta_ptr[0],8);
beta_ptr[1] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[1],8);
beta_ptr[1] = vsetq_lane_s8(beta[23],beta_ptr[1],8);
beta_ptr[2] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[2],8);
beta_ptr[2] = vsetq_lane_s8(beta[39],beta_ptr[2],8);
beta_ptr[3] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[3],8);
beta_ptr[3] = vsetq_lane_s8(beta[55],beta_ptr[3],8);
beta_ptr[4] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[4],8);
beta_ptr[4] = vsetq_lane_s8(beta[71],beta_ptr[4],8);
beta_ptr[5] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[5],8);
beta_ptr[5] = vsetq_lane_s8(beta[87],beta_ptr[5],8);
beta_ptr[6] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[6],8);
beta_ptr[6] = vsetq_lane_s8(beta[103],beta_ptr[6],8);
beta_ptr[7] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[7],8);
beta_ptr[7] = vsetq_lane_s8(beta[119],beta_ptr[7],8);
#endif
}
}
void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, llr_t* systematic,unsigned short frame_length)
{
void compute_ext8(llr_t *alpha,llr_t *beta,llr_t *m_11,llr_t *m_10,llr_t *ext, llr_t *systematic,unsigned short frame_length) {
#if defined(__x86_64__) || defined(__i386__)
__m128i *alpha128=(__m128i *)alpha;
__m128i *beta128=(__m128i *)beta;
......@@ -642,27 +619,20 @@ void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, l
int8x16_t m11_1,m11_2,m11_3,m11_4;
#endif
int k;
//
// LLR computation, 8 consequtive bits per loop
//
#ifdef DEBUG_LOGMAP
printf("compute_ext, %p, %p, %p, %p, %p, %p ,framelength %d\n",alpha,beta,m_11,m_10,ext,systematic,frame_length);
#endif
alpha_ptr = alpha128;
beta_ptr = &beta128[8];
for (k=0; k<(frame_length>>4); k++) {
#if defined(__x86_64__) || defined(__i386__)
m11_128 = (__m128i*)&m_11[k<<4];
m10_128 = (__m128i*)&m_10[k<<4];
ext_128 = (__m128i*)&ext[k<<4];
m11_128 = (__m128i *)&m_11[k<<4];
m10_128 = (__m128i *)&m_10[k<<4];
ext_128 = (__m128i *)&ext[k<<4];
m00_4 = _mm_adds_epi8(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
m11_4 = _mm_adds_epi8(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
m00_3 = _mm_adds_epi8(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
......@@ -679,7 +649,6 @@ void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, l
m10_2 = _mm_adds_epi8(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
m10_1 = _mm_adds_epi8(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
m01_1 = _mm_adds_epi8(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
m01_1 = _mm_max_epi8(m01_1,m01_2);
m01_1 = _mm_max_epi8(m01_1,m01_3);
m01_1 = _mm_max_epi8(m01_1,m01_4);
......@@ -692,28 +661,19 @@ void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, l
m11_1 = _mm_max_epi8(m11_1,m11_2);
m11_1 = _mm_max_epi8(m11_1,m11_3);
m11_1 = _mm_max_epi8(m11_1,m11_4);
m01_1 = _mm_subs_epi8(m01_1,*m10_128);
m00_1 = _mm_subs_epi8(m00_1,*m11_128);
m10_1 = _mm_adds_epi8(m10_1,*m10_128);
m11_1 = _mm_adds_epi8(m11_1,*m11_128);
m01_1 = _mm_max_epi8(m01_1,m00_1);
m10_1 = _mm_max_epi8(m10_1,m11_1);
*ext_128 = _mm_subs_epi8(m10_1,m01_1);
alpha_ptr+=8;
beta_ptr+=8;
#elif defined(__arm__)
m11_128 = (int8x16_t*)&m_11[k<<4];
m10_128 = (int8x16_t*)&m_10[k<<4];
ext_128 = (int8x16_t*)&ext[k<<4];
m11_128 = (int8x16_t *)&m_11[k<<4];
m10_128 = (int8x16_t *)&m_10[k<<4];
ext_128 = (int8x16_t *)&ext[k<<4];
m00_4 = vqaddq_s8(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
m11_4 = vqaddq_s8(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
m00_3 = vqaddq_s8(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
......@@ -730,7 +690,6 @@ void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, l
m10_2 = vqaddq_s8(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
m10_1 = vqaddq_s8(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
m01_1 = vqaddq_s8(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
m01_1 = vmaxq_s8(m01_1,m01_2);
m01_1 = vmaxq_s8(m01_1,m01_3);
m01_1 = vmaxq_s8(m01_1,m01_4);
......@@ -743,27 +702,17 @@ void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, l
m11_1 = vmaxq_s8(m11_1,m11_2);
m11_1 = vmaxq_s8(m11_1,m11_3);
m11_1 = vmaxq_s8(m11_1,m11_4);
m01_1 = vqsubq_s8(m01_1,*m10_128);
m00_1 = vqsubq_s8(m00_1,*m11_128);
m10_1 = vqaddq_s8(m10_1,*m10_128);
m11_1 = vqaddq_s8(m11_1,*m11_128);
m01_1 = vmaxq_s8(m01_1,m00_1);
m10_1 = vmaxq_s8(m10_1,m11_1);
*ext_128 = vqsubq_s8(m10_1,m01_1);
alpha_ptr+=8;
beta_ptr+=8;
#endif
}
}
......@@ -771,8 +720,7 @@ void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, l
//int pi2[n],pi3[n+8],pi5[n+8],pi4[n+8],pi6[n+8],
int *pi2tab8[188],*pi5tab8[188],*pi4tab8[188],*pi6tab8[188];
void free_td8(void)
{
void free_td8(void) {
int ind;
for (ind=0; ind<188; ind++) {
......@@ -787,14 +735,11 @@ void free_td8(void)
extern RAN_CONTEXT_t RC;
void init_td8(void)
{
void init_td8(void) {
int ind,i,j,n,n2,pi,pi3;
short * base_interleaver;
short *base_interleaver;
for (ind=0; ind<188; ind++) {
n = f1f2mat[ind].nb_bits;
base_interleaver=il_tb+f1f2mat[ind].beg_index;
#ifdef MEX
......@@ -816,7 +761,6 @@ void init_td8(void)
n2 = n;
for (j=0,i=0; i<n2; i++,j+=16) {
if (j>=n2)
j-=(n2-1);
......@@ -831,7 +775,6 @@ void init_td8(void)
pi5tab8[ind][pi3] = pi2tab8[ind][i];
pi6tab8[ind][pi] = pi2tab8[ind][i];
}
}
}
......@@ -853,31 +796,22 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
/* y is a pointer to the input
decoded_bytes is a pointer to the decoded output
n is the size in bits of the coded block, with the tail */
int n2;
llr_t y8[3*(n+16)] __attribute__((aligned(16)));
llr_t systematic0[n+16] __attribute__ ((aligned(16)));
llr_t systematic1[n+16] __attribute__ ((aligned(16)));
llr_t systematic2[n+16] __attribute__ ((aligned(16)));
llr_t yparity1[n+16] __attribute__ ((aligned(16)));
llr_t yparity2[n+16] __attribute__ ((aligned(16)));
llr_t ext[n+128] __attribute__((aligned(16)));
llr_t ext2[n+128] __attribute__((aligned(16)));
llr_t alpha[(n+16)*8] __attribute__ ((aligned(16)));
llr_t beta[(n+16)*8] __attribute__ ((aligned(16)));
llr_t m11[n+16] __attribute__ ((aligned(16)));
llr_t m10[n+16] __attribute__ ((aligned(16)));
// int *pi2_p,*pi4_p,*pi5_p,*pi6_p;
int *pi4_p,*pi5_p,*pi6_p;
llr_t *s,*s1,*s2,*yp1,*yp2,*yp;
unsigned int i,j,iind;//,pi;
unsigned char iteration_cnt=0;
unsigned int crc,oldcrc,crc_len;
......@@ -892,11 +826,9 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
int8x16_t tmp, zeros=vdupq_n_s8(0);
const uint8_t __attribute__ ((aligned (16))) _Powers[16]=
{ 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
// Set the powers of 2 (do it once for all, if applicable)
uint8x16_t Powers= vld1q_u8(_Powers);
#endif
int offset8_flag=0;
if (crc_type > 3) {
......@@ -904,17 +836,14 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
return 255;
}
if (init_stats) start_meas(init_stats);
if ((n&15)>0) {
n2 = n+8;
offset8_flag=1;
} else
n2 = n;
for (iind=0; iind < 188 && f1f2mat[iind].nb_bits != n; iind++);
if ( iind == 188 ) {
......@@ -941,13 +870,12 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
}
#if defined(__x86_64__) || defined(__i386__)
// note: this makes valgrind freak
__m128i avg=_mm_set1_epi32(0);
for (i=0; i<(3*(n>>4))+1; i++) {
__m128i tmp=_mm_abs_epi16(_mm_unpackhi_epi16(((__m128i*)y)[i],((__m128i*)y)[i]));
avg=_mm_add_epi32(_mm_cvtepi16_epi32(_mm_abs_epi16(((__m128i*)y)[i])),avg);
__m128i tmp=_mm_abs_epi16(_mm_unpackhi_epi16(((__m128i *)y)[i],((__m128i *)y)[i]));
avg=_mm_add_epi32(_mm_cvtepi16_epi32(_mm_abs_epi16(((__m128i *)y)[i])),avg);
avg=_mm_add_epi32(_mm_cvtepi16_epi32(tmp),avg);
}
......@@ -971,15 +899,13 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
for (i=0,j=0; i<(3*(n2>>4))+1; i++,j+=2)
((__m128i *)y8)[i] = _mm_packs_epi16(_mm_srai_epi16(((__m128i *)y)[j],3),_mm_srai_epi16(((__m128i *)y)[j+1],4));
yp128 = (__m128i*)y8;
yp128 = (__m128i *)y8;
#elif defined(__arm__)
int32x4_t avg=vdupq_n_s32(0);
for (i=0; i<(3*(n>>4))+1; i++) {
int16x8_t tmp=vabsq_s16(((int16x8_t*)y)[i]);
avg = vqaddq_s32(avg,vaddl_s16(((int16x4_t*)&tmp)[0],((int16x4_t*)&tmp)[1]));
int16x8_t tmp=vabsq_s16(((int16x8_t *)y)[i]);
avg = vqaddq_s32(avg,vaddl_s16(((int16x4_t *)&tmp)[0],((int16x4_t *)&tmp)[1]));
}
int32_t round_avg=(vgetq_lane_s32(avg,0)+vgetq_lane_s32(avg,1)+vgetq_lane_s32(avg,2)+vgetq_lane_s32(avg,3))/(n*3);
......@@ -999,10 +925,8 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
for (i=0,j=0; i<(3*(n2>>3))+1; i++,j+=2)
((int8x8_t *)y8)[i] = vqmovn_s16(vshrq_n_s16(((int16x8_t *)y)[j],3));
yp128 = (int8x16_t*)y8;
yp128 = (int8x16_t *)y8;
#endif
s = systematic0;
s1 = systematic1;
s2 = systematic2;
......@@ -1020,8 +944,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
}
#endif
yp=(llr_t*)yp128;
yp=(llr_t *)yp128;
if (n2>n) {
/*
......@@ -1031,7 +954,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
s1[n+4]=0;s1[n+5]=0;s1[n+6]=0;s1[n+7]=0;
s2[n]=0;s2[n+1]=0;s2[n+2]=0;s2[n+3]=0;
s2[n+4]=0;s2[n+5]=0;s2[n+6]=0;s2[n+7]=0;*/
yp=(llr_t*)(y8+n);
yp=(llr_t *)(y8+n);
}
// printf("n=%d,n2=%d\n",n,n2);
......@@ -1045,7 +968,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
yp1[i] = *yp;
yp++;
#ifdef DEBUG_LOGMAP
printf("Term 1 (%d): %d %d\n",i,s[i],yp1[i]);
printf("Term 1 (%u): %d %d\n",i,s[i],yp1[i]);
#endif //DEBUG_LOGMAP
}
......@@ -1057,7 +980,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
yp2[i-16] = *yp;
yp++;
#ifdef DEBUG_LOGMAP
printf("Term 2 (%d): %d %d\n",i-16,s[i],yp2[i-16]);
printf("Term 2 (%u): %d %d\n",i-16,s[i],yp2[i-16]);
#endif //DEBUG_LOGMAP
}
......@@ -1068,63 +991,59 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
if (init_stats) stop_meas(init_stats);
// do log_map from first parity bit
log_map8(systematic0,yparity1,m11,m10,alpha,beta,ext,n2,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
while (iteration_cnt++ < max_iterations) {
while (iteration_cnt++ < max_iterations) {
#ifdef DEBUG_LOGMAP
printf("\n*******************ITERATION %d (n %d, n2 %d), ext %p\n\n",iteration_cnt,n,n2,ext);
#endif //DEBUG_LOGMAP
if (intl1_stats) start_meas(intl1_stats);
pi4_p=pi4tab8[iind];
for (i=0; i<(n2>>4); i++) { // steady-state portion
#if defined(__x86_64__) || defined(__i386__)
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],0);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],1);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],2);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],3);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],4);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],5);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],6);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],7);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],8);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],9);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],10);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],11);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],12);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],13);
tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],14);
((__m128i *)systematic2)[i]=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],15);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],0);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],1);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],2);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],3);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],4);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],5);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],6);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],7);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],8);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],9);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],10);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],11);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],12);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],13);
tmp=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],14);
((__m128i *)systematic2)[i]=_mm_insert_epi8(tmp,((llr_t *)ext)[*pi4_p++],15);
#elif defined(__arm__)
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,0);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,1);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,2);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,3);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,4);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,5);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,6);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,7);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,8);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,9);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,10);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,11);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,12);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,13);
tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,14);
((int8x16_t *)systematic2)[i]=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,15);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,0);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,1);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,2);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,3);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,4);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,5);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,6);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,7);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,8);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,9);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,10);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,11);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,12);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,13);
tmp=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,14);
((int8x16_t *)systematic2)[i]=vsetq_lane_s8(((llr_t *)ext)[*pi4_p++],tmp,15);
#endif
}
if (intl1_stats) stop_meas(intl1_stats);
// do log_map from second parity bit
log_map8(systematic2,yparity2,m11,m10,alpha,beta,ext2,n2,1,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
pi5_p=pi5tab8[iind];
uint16_t decoded_bytes_interl[6144/16] __attribute__((aligned(16)));
......@@ -1148,7 +1067,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],14);
tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15);
decoded_bytes_interl[i]=(uint16_t) _mm_movemask_epi8(_mm_cmpgt_epi8(tmp,zeros));
((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i*)ext)[i]),((__m128i *)systematic0)[i]);
((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i *)ext)[i]),((__m128i *)systematic0)[i]);
#elif defined(__arm__)
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0);
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1);
......@@ -1167,12 +1086,11 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,14);
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,15);
uint64x2_t Mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcgtq_s8(tmp,zeros), Powers))));
vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[0], (uint8x16_t)Mask, 0);
vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[1], (uint8x16_t)Mask, 8);
((int8x16_t *)systematic1)[i] = vqaddq_s8(vqsubq_s8(tmp,((int8x16_t*)ext)[i]),((int8x16_t *)systematic0)[i]);
vst1q_lane_u8(&((uint8_t *)&decoded_bytes[i])[0], (uint8x16_t)Mask, 0);
vst1q_lane_u8(&((uint8_t *)&decoded_bytes[i])[1], (uint8x16_t)Mask, 8);
((int8x16_t *)systematic1)[i] = vqaddq_s8(vqsubq_s8(tmp,((int8x16_t *)ext)[i]),((int8x16_t *)systematic0)[i]);
#endif
}
} else {
for (i=0; i<(n2>>4); i++) {
#if defined(__x86_64__) || defined(__i386__)
......@@ -1193,8 +1111,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],14);
tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15);
tmp128[i] = _mm_adds_epi8(((__m128i *)ext2)[i],((__m128i *)systematic2)[i]);
((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i*)ext)[i]),((__m128i *)systematic0)[i]);
((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i *)ext)[i]),((__m128i *)systematic0)[i]);
#elif defined(__arm__)
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0);
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1);
......@@ -1213,9 +1130,7 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,14);
tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,15);
tmp128[i] = vqaddq_s8(((int8x16_t *)ext2)[i],((int8x16_t *)systematic2)[i]);
((int8x16_t *)systematic1)[i] = vqaddq_s8(vqsubq_s8(tmp,((int8x16_t*)ext)[i]),((int8x16_t *)systematic0)[i]);
((int8x16_t *)systematic1)[i] = vqaddq_s8(vqsubq_s8(tmp,((int8x16_t *)ext)[i]),((int8x16_t *)systematic0)[i]);
#endif
}
}
......@@ -1225,11 +1140,10 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
if (intl2_stats) start_meas(intl2_stats);
if ((n2&0x7f) == 0) { // n2 is a multiple of 128 bits
// re-order the decoded bits in theregular order
// as it is presently ordered as 16 sequential columns
#if defined(__x86_64__) || defined(__i386__)
__m128i* dbytes=(__m128i*)decoded_bytes_interl;
__m128i *dbytes=(__m128i *)decoded_bytes_interl;
__m128i shuffle=SHUFFLE16(7,6,5,4,3,2,1,0);
__m128i mask __attribute__((aligned(16)));
int n_128=n2>>7;
......@@ -1239,7 +1153,6 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
__m128i tmp __attribute__((aligned(16)));
tmp=_mm_shuffle_epi8(dbytes[i],shuffle);
__m128i tmp2 __attribute__((aligned(16))) ;
tmp2=_mm_and_si128(tmp,mask);
tmp2=_mm_cmpeq_epi16(tmp2,mask);
// printf("decoded_bytes %p\n",decoded_bytes);
......@@ -1253,17 +1166,17 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
decoded_bytes[n_128*j +i]=(uint8_t) _mm_movemask_epi8(_mm_packs_epi16(tmp2,zeros));
}
}
#elif defined(__arm__)
uint8x16_t* dbytes=(uint8x16_t*)decoded_bytes_interl;
uint8x16_t *dbytes=(uint8x16_t *)decoded_bytes_interl;
uint16x8_t mask __attribute__((aligned(16)));
int n_128=n2>>7;
for (i=0; i<n_128; i++) {
mask=vdupq_n_u16(1);
uint8x16_t tmp __attribute__((aligned(16)));
tmp=vcombine_u8(vrev64_u8(((uint8x8_t*)&dbytes[i])[1]),vrev64_u8(((uint8x8_t*)&dbytes[i])[0]));
tmp=vcombine_u8(vrev64_u8(((uint8x8_t *)&dbytes[i])[1]),vrev64_u8(((uint8x8_t *)&dbytes[i])[0]));
vst1q_lane_u8(&decoded_bytes[n_128*0+i],(uint8x16_t)vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(tmp, Powers)))),0);
int j;
for (j=1; j<16; j++) {
......@@ -1314,8 +1227,8 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,9);
tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,8);
uint64x2_t Mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcgtq_s8(tmp,zeros), Powers))));
vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[0], (uint8x16_t)Mask, 0);
vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[1], (uint8x16_t)Mask, 8);
vst1q_lane_u8(&((uint8_t *)&decoded_bytes[i])[0], (uint8x16_t)Mask, 0);
vst1q_lane_u8(&((uint8_t *)&decoded_bytes[i])[1], (uint8x16_t)Mask, 8);
#endif
}
}
......@@ -1324,7 +1237,6 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
oldcrc= *((unsigned int *)(&decoded_bytes[(n>>3)-crc_len]));
switch (crc_type) {
case CRC24_A:
oldcrc&=0x00ffffff;
crc = crc24a(&decoded_bytes[F>>3],
......@@ -1372,13 +1284,13 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
if (iteration_cnt < max_iterations) {
log_map8(systematic1,yparity1,m11,m10,alpha,beta,ext,n2,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
#if defined(__x86_64__) || defined(__i386__)
__m128i* ext_128=(__m128i*) ext;
__m128i* s1_128=(__m128i*) systematic1;
__m128i* s0_128=(__m128i*) systematic0;
__m128i *ext_128=(__m128i *) ext;
__m128i *s1_128=(__m128i *) systematic1;
__m128i *s0_128=(__m128i *) systematic0;
#elif defined(__arm__)
int8x16_t* ext_128=(int8x16_t*) ext;
int8x16_t* s1_128=(int8x16_t*) systematic1;
int8x16_t* s0_128=(int8x16_t*) systematic0;
int8x16_t *ext_128=(int8x16_t *) ext;
int8x16_t *s1_128=(int8x16_t *) systematic1;
int8x16_t *s0_128=(int8x16_t *) systematic0;
#endif
int myloop=n2>>4;
......@@ -1394,5 +1306,4 @@ uint8_t phy_threegpplte_turbo_decoder8(int16_t *y,
}
return(iteration_cnt);
}
......@@ -47,27 +47,20 @@ void
ccodedot11_encode (unsigned int numbytes,
unsigned char *inPtr,
unsigned char *outPtr,
unsigned char puncturing)
{
unsigned char puncturing) {
unsigned int state;
unsigned char c, out, shiftbit =0;
// printf("In ccodedot11_encode (%d,%p,%p,%d)\n",numbytes,inPtr,outPtr,puncturing);
#ifdef DEBUG_CCODE
unsigned int dummy;
#endif //DEBUG_CCODE
int bit_index;
/* The input bit is shifted in position 8 of the state.
Shiftbit will take values between 1 and 8 */
state = 0;
#ifdef DEBUG_CCODE
dummy = 0;
#endif //DEBUG_CCODE
/* Do not increment inPtr until we read the next octet */
bit_index=0;
......@@ -80,7 +73,6 @@ ccodedot11_encode (unsigned int numbytes,
switch (puncturing) {
case 0: //rate 1/2
for (shiftbit = 0; shiftbit<8; shiftbit++) {
state >>= 1;
if ((c&(1<<shiftbit)) != 0) {
......@@ -88,22 +80,18 @@ ccodedot11_encode (unsigned int numbytes,
}
out = ccodedot11_table[state];
*outPtr++ = out & 1;
*outPtr++ = (out>>1)&1;
#ifdef DEBUG_CCODE
printf("%d: %d -> %d (%d)\n",dummy,state,out,ccodedot11_table[state]);
printf("%u: %u -> %d (%u)\n",dummy,state,out,ccodedot11_table[state]);
dummy+=2;
#endif //DEBUG_CCODE
}
break;
case 1: // rate 3/4
for (shiftbit = 0; shiftbit<8; shiftbit++) {
state >>= 1;
if ((c&(1<<shiftbit)) != 0) {
......@@ -119,10 +107,9 @@ ccodedot11_encode (unsigned int numbytes,
*outPtr++ = (out>>1)&1;
#ifdef DEBUG_CCODE
printf("%d: %d -> %d (%d)\n",dummy,state,out,ccodedot11_table[state]);
printf("%u: %u -> %d (%u)\n",dummy,state,out,ccodedot11_table[state]);
dummy+=2;
#endif //DEBUG_CCODE
bit_index=(bit_index==2)?0:(bit_index+1);
}
......@@ -130,7 +117,6 @@ ccodedot11_encode (unsigned int numbytes,
case 2: // rate 2/3
for (shiftbit = 0; shiftbit<8; shiftbit++) {
state >>= 1;
if ((c&(1<<shiftbit)) != 0) {
......@@ -138,19 +124,16 @@ ccodedot11_encode (unsigned int numbytes,
}
out = ccodedot11_table[state];
*outPtr++ = out & 1;
if (bit_index==0)
*outPtr++ = (out>>1)&1;
#ifdef DEBUG_CCODE
printf("%d: %d -> %d (%d)\n",dummy,state,out,ccodedot11_table[state]);
printf("%d: %u -> %d (%u)\n",dummy,state,out,ccodedot11_table[state]);
dummy+=2;
#endif //DEBUG_CCODE
bit_index=(bit_index==0)?1:0;
}
break;
......@@ -181,8 +164,6 @@ ccodedot11_encode (unsigned int numbytes,
}
*/
}
......@@ -197,8 +178,7 @@ ccodedot11_encode (unsigned int numbytes,
/* Basic code table initialization for constraint length 7 */
/* Input in MSB, followed by state in 6 LSBs */
void ccodedot11_init(void)
{
void ccodedot11_init(void) {
unsigned int i, j, k, sum;
for (i = 0; i < 128; i++) {
......@@ -219,8 +199,7 @@ void ccodedot11_init(void)
}
/* Input in LSB, followed by state in 6 MSBs */
void ccodedot11_init_inv(void)
{
void ccodedot11_init_inv(void) {
unsigned int i, j, k, sum;
for (i = 0; i < 128; i++) {
......@@ -251,21 +230,15 @@ void ccodedot11_init_inv(void)
#ifdef DEBUG_CCODE
#include <stdio.h>
main()
{
main() {
unsigned char test[] = "0Thebigredfox";
unsigned char output[512], *inPtr, *outPtr;
unsigned int i;
test[0] = 128;
test[1] = 0;
ccodedot11_init();
inPtr = test;
outPtr = output;
ccodedot11_encode(16, inPtr, outPtr,0);
for (i = 0; i < 32; i++) printf("%x ", output[i]);
......
......@@ -52,20 +52,16 @@ ccodelte_encode (int32_t numbits,
uint8_t add_crc,
uint8_t *inPtr,
uint8_t *outPtr,
uint16_t rnti)
{
uint16_t rnti) {
uint32_t state;
uint8_t c, out, first_bit;
int8_t shiftbit=0;
uint16_t c16;
uint16_t next_last_byte=0;
uint32_t crc=0;
#ifdef DEBUG_CCODE
uint32_t dummy=0;
#endif //DEBUG_CCODE
/* The input bit is shifted in position 8 of the state.
Shiftbit will take values between 1 and 8 */
state = 0;
......@@ -137,17 +133,12 @@ ccodelte_encode (int32_t numbits,
#endif //DEBUG_CCODE
/* Do not increment inPtr until we read the next octet */
while (numbits > 0) {
c = *inPtr++;
#ifdef DEBUG_CCODE
printf("** %x **\n",c);
#endif //DEBUG_CCODE
// for (shiftbit = 0; (shiftbit<8) && (numbits>0);shiftbit++,numbits--) {
for (shiftbit = 7; (shiftbit>=0) && (numbits>0); shiftbit--,numbits--) {
state >>= 1;
......@@ -157,23 +148,18 @@ ccodelte_encode (int32_t numbits,
}
out = ccodelte_table[state];
*outPtr++ = out & 1;
*outPtr++ = (out>>1)&1;
*outPtr++ = (out>>2)&1;
#ifdef DEBUG_CCODE
printf("numbits %d, input %d, outbit %d: %d -> %d (%d%d%d)\n",numbits,state>>6,dummy,state,out,out&1,(out>>1)&1,(out>>2)&1);
dummy+=3;
#endif //DEBUG_CCODE
}
}
// now code 8-bit CRC for UCI
if (add_crc == 1) {
c = (uint8_t)(crc>>24);
// for (shiftbit = 0; (shiftbit<8);shiftbit++) {
......@@ -185,22 +171,18 @@ ccodelte_encode (int32_t numbits,
}
out = ccodelte_table[state];
*outPtr++ = out & 1;
*outPtr++ = (out>>1)&1;
*outPtr++ = (out>>2)&1;
#ifdef DEBUG_CCODE
printf("crc bit %d input %d, outbit %d: %d -> %d (%d)\n",shiftbit,state>>6,dummy,state,out,ccodelte_table[state]);
printf("crc bit %d input %d, outbit %d: %d -> %d (%u)\n",shiftbit,state>>6,dummy,state,out,ccodelte_table[state]);
dummy+=3;
#endif //DEBUG_CCODE
}
}
// now code 16-bit CRC for DCI
if (add_crc == 2) {
c16 = (uint16_t)(crc>>16);
// for (shiftbit = 0; (shiftbit<16);shiftbit++) {
......@@ -212,16 +194,13 @@ ccodelte_encode (int32_t numbits,
}
out = ccodelte_table[state];
*outPtr++ = out & 1;
*outPtr++ = (out>>1)&1;
*outPtr++ = (out>>2)&1;
#ifdef DEBUG_CCODE
printf("crc bit %d input %d, outbit %d: %d -> %d (%d)\n",shiftbit,state>>6,dummy,state,out,ccodelte_table[state]);
printf("crc bit %d input %d, outbit %d: %d -> %d (%u)\n",shiftbit,state>>6,dummy,state,out,ccodelte_table[state]);
dummy+=3;
#endif //DEBUG_CCODE
}
}
}
......@@ -238,8 +217,7 @@ ccodelte_encode (int32_t numbits,
/* Basic code table initialization for constraint length 7 */
/* Input in MSB, followed by state in 6 LSBs */
void ccodelte_init(void)
{
void ccodelte_init(void) {
unsigned int i, j, k, sum;
for (i = 0; i < 128; i++) {
......@@ -260,8 +238,7 @@ void ccodelte_init(void)
}
/* Input in LSB, followed by state in 6 MSBs */
void ccodelte_init_inv(void)
{
void ccodelte_init_inv(void) {
unsigned int i, j, k, sum;
for (i = 0; i < 128; i++) {
......@@ -281,8 +258,7 @@ void ccodelte_init_inv(void)
}
}
void ccodedab_init(void)
{
void ccodedab_init(void) {
unsigned int i, j, k, sum;
for (i = 0; i < 128; i++) {
......@@ -303,8 +279,7 @@ void ccodedab_init(void)
}
/* Input in LSB, followed by state in 6 MSBs */
void ccodedab_init_inv(void)
{
void ccodedab_init_inv(void) {
unsigned int i, j, k, sum;
for (i = 0; i < 128; i++) {
......@@ -334,21 +309,15 @@ void ccodedab_init_inv(void)
#ifdef CCODE_MAIN
#include <stdio.h>
main()
{
main() {
unsigned char test[] = "Thebigredfox";
unsigned char output[512], *inPtr, *outPtr;
unsigned int i;
test[0] = 128;
test[1] = 0;
ccodelte_init();
inPtr = test;
outPtr = output;
ccodelte_encode(21, inPtr, outPtr);
for (i = 0; i < 21*3; i++) printf("%x ", output[i]);
......
......@@ -25,8 +25,8 @@
date: 21.10.2009
*/
#ifdef MAIN
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdlib.h>
#endif
#include "PHY/defs_eNB.h"
#include "PHY/LTE_TRANSPORT/transport_common.h"
......@@ -42,9 +42,7 @@ static uint32_t bitrev_cc[32] = {1,17,9,25,5,21,13,29,3,19,11,27,7,23,15,31,0,16
//#define RM_DEBUG2 1
//#define RM_DEBUG_CC 1
uint32_t sub_block_interleaving_turbo(uint32_t D, uint8_t *d,uint8_t *w)
{
uint32_t sub_block_interleaving_turbo(uint32_t D, uint8_t *d,uint8_t *w) {
uint32_t RTC = (D>>5), ND, ND3;
uint32_t row,col,Kpi;
uint32_t index3,k,k2;
......@@ -64,7 +62,6 @@ uint32_t sub_block_interleaving_turbo(uint32_t D, uint8_t *d,uint8_t *w)
printf("RTC = %d, Kpi=%d, ND=%d\n",RTC,Kpi,ND);
#endif
ND3 = ND*3;
// copy d02 to dD2 (for mod Kpi operation from clause (4), p.16 of 36.212
d[(3*D)+2] = d[2];
k=0;
......@@ -80,12 +77,9 @@ uint32_t sub_block_interleaving_turbo(uint32_t D, uint8_t *d,uint8_t *w)
index3 = bitrev_x3[col];//3*index;
for (row=0; row<RTC; row++) {
w[k] = d1[index3];//d[index3-ND3];
w[Kpi+k2] = d2[index3];//d[index3-ND3+1];
w[Kpi+1+k2] = d3[index3];//d[index3-ND3+5];
#ifdef RM_DEBUG
printf("row %d, index %d, index-Nd %d index-Nd+1 %d (k,Kpi+2k,Kpi+2k+1) (%d,%d,%d) w(%d,%d,%d)\n",row,index,index-ND,((index+1)%Kpi)-ND,k,Kpi+(k<<1),Kpi+(k<<1)+1,w[k],w[Kpi+(k<<1)],w[Kpi+1+(k<<1)]);
......@@ -100,7 +94,8 @@ uint32_t sub_block_interleaving_turbo(uint32_t D, uint8_t *d,uint8_t *w)
#endif
index3+=96;
k++;k2+=2;
k++;
k2+=2;
}
}
......@@ -120,9 +115,7 @@ uint32_t sub_block_interleaving_turbo(uint32_t D, uint8_t *d,uint8_t *w)
}
uint32_t sub_block_interleaving_cc(uint32_t D, uint8_t *d,uint8_t *w)
{
uint32_t sub_block_interleaving_cc(uint32_t D, uint8_t *d,uint8_t *w) {
uint32_t RCC = (D>>5), ND, ND3;
uint32_t row,col,Kpi,index;
uint32_t index3,k;
......@@ -141,7 +134,6 @@ uint32_t sub_block_interleaving_cc(uint32_t D, uint8_t *d,uint8_t *w)
printf("RCC = %d, Kpi=%d, ND=%d\n",RCC,Kpi,ND);
#endif
ND3 = ND*3;
k=0;
for (col=0; col<32; col++) {
......@@ -180,9 +172,7 @@ uint32_t sub_block_interleaving_cc(uint32_t D, uint8_t *d,uint8_t *w)
return(RCC);
}
void sub_block_deinterleaving_turbo(uint32_t D,int16_t *d,int16_t *w)
{
void sub_block_deinterleaving_turbo(uint32_t D,int16_t *d,int16_t *w) {
uint32_t RTC = (D>>5), ND, ND3;
uint32_t row,col,Kpi,index;
uint32_t index3,k,k2;
......@@ -199,7 +189,6 @@ void sub_block_deinterleaving_turbo(uint32_t D,int16_t *d,int16_t *w)
printf("RTC = %d, Kpi=%d, ND=%d\n",RTC,Kpi,ND);
#endif
ND3 = ND*3;
// copy d02 to dD2 (for mod Kpi operation from clause (4), p.16 of 36.212
k=0;
k2=0;
......@@ -215,7 +204,6 @@ void sub_block_deinterleaving_turbo(uint32_t D,int16_t *d,int16_t *w)
index3 = bitrev_x3[col];//3*index;
for (row=0; row<RTC; row++) {
d1[index3] = w[k];
d2[index3] = w[Kpi+k2];
d3[index3] = w[Kpi+1+k2];
......@@ -229,12 +217,9 @@ void sub_block_deinterleaving_turbo(uint32_t D,int16_t *d,int16_t *w)
// if (ND>0)
// d[2] = LTE_NULL;//d[(3*D)+2];
}
void sub_block_deinterleaving_cc(uint32_t D,int8_t *d,int8_t *w)
{
void sub_block_deinterleaving_cc(uint32_t D,int8_t *d,int8_t *w) {
//WANG_Hao uint32_t RCC = (D>>5), ND, ND3;
uint32_t RCC = (D>>5);
ptrdiff_t ND, ND3;
......@@ -251,10 +236,9 @@ void sub_block_deinterleaving_cc(uint32_t D,int8_t *d,int8_t *w)
ND = Kpi - D;
#ifdef RM_DEBUG2
printf("sub_block_interleaving_cc : D = %d (%d), d %p, w %p\n",D,D*3,d,w);
printf("RCC = %d, Kpi=%d, ND=%d\n",RCC,Kpi,ND);
printf("RCC = %d, Kpi=%d, ND=%ld\n",RCC,Kpi,ND);
#endif
ND3 = ND*3;
k=0;
for (col=0; col<32; col++) {
......@@ -265,24 +249,20 @@ void sub_block_deinterleaving_cc(uint32_t D,int8_t *d,int8_t *w)
index3 = 3*index;
for (row=0; row<RCC; row++) {
d[index3-ND3] = w[k];
d[index3-ND3+1] = w[Kpi+k];
d[index3-ND3+2] = w[(Kpi<<1)+k];
#ifdef RM_DEBUG2
printf("row %d, index %d k %d index3-ND3 %d w(%d,%d,%d)\n",row,index,k,index3-ND3,w[k],w[Kpi+k],w[(Kpi<<1)+k]);
printf("row %d, index %d k %d index3-ND3 %ld w(%d,%d,%d)\n",row,index,k,index3-ND3,w[k],w[Kpi+k],w[(Kpi<<1)+k]);
#endif
index3+=96;
index+=32;
k++;
}
}
}
uint32_t generate_dummy_w(uint32_t D, uint8_t *w,uint8_t F)
{
uint32_t generate_dummy_w(uint32_t D, uint8_t *w,uint8_t F) {
uint32_t RTC = (D>>5), ND;
uint32_t col,Kpi,index;
int32_t k,k2;
......@@ -301,8 +281,6 @@ uint32_t generate_dummy_w(uint32_t D, uint8_t *w,uint8_t F)
printf("dummy sub_block_interleaving_turbo : D = %d (%d)\n",D,D*3);
printf("RTC = %d, Kpi=%d, ND=%d, F=%d (Nulled %d)\n",RTC,Kpi,ND,F,(2*F + 3*ND));
#endif
k=0;
k2=0;
wKpi = &w[Kpi];
......@@ -371,9 +349,7 @@ uint32_t generate_dummy_w(uint32_t D, uint8_t *w,uint8_t F)
return(RTC);
}
uint32_t generate_dummy_w_cc(uint32_t D, uint8_t *w)
{
uint32_t generate_dummy_w_cc(uint32_t D, uint8_t *w) {
uint32_t RCC = (D>>5), ND;
uint32_t col,Kpi,index;
int32_t k;
......@@ -392,7 +368,6 @@ uint32_t generate_dummy_w_cc(uint32_t D, uint8_t *w)
printf("RCC = %d, Kpi=%d, ND=%d, (Nulled %d)\n",RCC,Kpi,ND,3*ND);
#endif
// ND3 = ND*3;
// copy d02 to dD2 (for mod Kpi operation from clause (4), p.16 of 36.212
k=0;
......@@ -466,8 +441,6 @@ uint32_t lte_rate_matching_turbo(uint32_t RTC,
uint8_t nb_rb)
// uint8_t m)
{
uint32_t Nir,Ncb,Gp,GpmodC,E,Ncbmod,ind,k;
// int cnt=0;
uint8_t *e2;
......@@ -487,11 +460,11 @@ uint32_t lte_rate_matching_turbo(uint32_t RTC,
if (Mdlharq>0) { // Downlink
Nir = Nsoft/Kmimo/cmin(8,Mdlharq);
Ncb = cmin(Nir/C,3*(RTC<<5));
}
else { // Uplink
} else { // Uplink
Nir=0;
Ncb = 3*(RTC<<5); // Kw
}
#ifdef RM_DEBUG_TX
if (rvidx==0 && r==0) {
......@@ -518,7 +491,6 @@ uint32_t lte_rate_matching_turbo(uint32_t RTC,
AssertFatal(Qm>0,"Qm is 0\n");
Gp = G/Nl/Qm;
GpmodC = Gp%C;
#ifdef RM_DEBUG
printf("lte_rate_matching_turbo: Ncb %d, Kw %d, Nir/C %d, rvidx %d, G %d, Qm %d, Nl%d, r %d\n",Ncb,3*(RTC<<5),Nir/C,rvidx, G, Qm,Nl,r);
#endif
......@@ -529,16 +501,12 @@ uint32_t lte_rate_matching_turbo(uint32_t RTC,
E = Nl*Qm * ((GpmodC==0?0:1) + (Gp/C));
Ncbmod = Ncb%(RTC<<3);
ind = RTC * (2+(rvidx*(((Ncbmod==0)?0:1) + (Ncb/(RTC<<3)))*2));
#ifdef RM_DEBUG_TX
printf("lte_rate_matching_turbo: E %d, k0 %d, Ncbmod %d, Ncb/(RTC<<3) %d\n",E,ind,Ncbmod,Ncb/(RTC<<3));
#endif
//e2=e+(r*E);
e2 = e;
k=0;
for (; (ind<Ncb)&&(k<E); ind++) {
......@@ -633,25 +601,16 @@ uint32_t lte_rate_matching_turbo(uint32_t RTC,
uint32_t lte_rate_matching_cc(uint32_t RCC,
uint16_t E,
uint8_t *w,
uint8_t *e)
{
uint8_t *e) {
uint32_t ind=0,k;
uint16_t Kw = 3*(RCC<<5);
#ifdef RM_DEBUG_CC
uint32_t nulled=0;
printf("lte_rate_matching_cc: Kw %d, E %d\n",Kw, E);
#endif
for (k=0; k<E; k++) {
while(w[ind] == LTE_NULL) {
#ifdef RM_DEBUG_CC
nulled++;
printf("RM_TX_CC : ind %d, NULL\n",ind);
......@@ -662,7 +621,6 @@ uint32_t lte_rate_matching_cc(uint32_t RCC,
ind=0;
}
e[k] = w[ind];
#ifdef RM_DEBUG_CC
// printf("k %d ind %d, w %c(%d)\n",k,ind,w[ind],w[ind]);
......@@ -695,10 +653,7 @@ int lte_rate_matching_turbo_rx(uint32_t RTC,
uint8_t Qm,
uint8_t Nl,
uint8_t r,
uint32_t *E_out)
{
uint32_t *E_out) {
uint32_t Nir,Ncb,Gp,GpmodC,E,Ncbmod,ind,k;
int16_t *soft_input2;
// int32_t w_tmp;
......@@ -715,8 +670,7 @@ int lte_rate_matching_turbo_rx(uint32_t RTC,
if (Mdlharq>0) { // Downlink
Nir = Nsoft/Kmimo/cmin(8,Mdlharq);
Ncb = cmin(Nir/C,3*(RTC<<5));
}
else { // Uplink
} else { // Uplink
Nir=0;
Ncb = 3*(RTC<<5);
}
......@@ -726,17 +680,13 @@ int lte_rate_matching_turbo_rx(uint32_t RTC,
Gp = G/Nl/Qm;
GpmodC = Gp%C;
if (r < (C-(GpmodC)))
E = Nl*Qm * (Gp/C);
else
E = Nl*Qm * ((GpmodC==0?0:1) + (Gp/C));
Ncbmod = Ncb%(RTC<<3);
ind = RTC * (2+(rvidx*(((Ncbmod==0)?0:1) + (Ncb/(RTC<<3)))*2));
#ifdef RM_DEBUG
printf("lte_rate_matching_turbo_rx: Clear %d, E %d, Ncb %d, Kw %d, rvidx %d, G %d, Qm %d, Nl%d, r %d\n",clear,E,Ncb,3*(RTC<<5),rvidx, G, Qm,Nl,r);
#endif
......@@ -831,10 +781,8 @@ int lte_rate_matching_turbo_rx(uint32_t RTC,
ind=0;
}
*/
*E_out = E;
return(0);
}
......@@ -842,28 +790,19 @@ void lte_rate_matching_cc_rx(uint32_t RCC,
uint16_t E,
int8_t *w,
uint8_t *dummy_w,
int8_t *soft_input)
{
int8_t *soft_input) {
uint32_t ind=0,k;
uint16_t Kw = 3*(RCC<<5);
uint32_t acc=1;
int16_t w16[Kw];
#ifdef RM_DEBUG_CC
uint32_t nulled=0;
printf("lte_rate_matching_cc_rx: Kw %d, E %d, w %p, soft_input %p\n",3*(RCC<<5),E,w,soft_input);
#endif
memset(w,0,Kw);
memset(w16,0,Kw*sizeof(int16_t));
for (k=0; k<E; k++) {
while(dummy_w[ind] == LTE_NULL) {
#ifdef RM_DEBUG_CC
nulled++;
......@@ -883,10 +822,7 @@ void lte_rate_matching_cc_rx(uint32_t RCC,
#ifdef RM_DEBUG_CC
printf("RM_RX_CC k %d (%d) ind: %d (%d)\n",k,soft_input[k],ind,w16[ind]);
#endif
w16[ind] += soft_input[k];
ind++;
if (ind==Kw) {
......@@ -907,7 +843,6 @@ void lte_rate_matching_cc_rx(uint32_t RCC,
}
#ifdef RM_DEBUG_CC
printf("Nulled %d\n",nulled);
#endif
}
......@@ -915,8 +850,7 @@ void lte_rate_matching_cc_rx(uint32_t RCC,
#ifdef MAIN
void main()
{
void main() {
uint8_t d[96+3+(3*6144)];
uint8_t w[3*6144],e[12*6144];
uint32_t RTC,G,rvidx;
......@@ -924,7 +858,6 @@ void main()
uint32_t mod_order = 4;
uint32_t first_dlsch_symbol = 2;
uint32_t i;
G = ( nb_rb * (12 * mod_order) * (12-first_dlsch_symbol-3)) ;//( nb_rb * (12 * mod_order) * (14-first_dlsch_symbol-3)) :
// initialize 96 first positions to "LTE_NULL"
......
......@@ -38,9 +38,7 @@ int lte_segmentation(unsigned char *input_buffer,
unsigned int *Cminus,
unsigned int *Kplus,
unsigned int *Kminus,
unsigned int *F)
{
unsigned int *F) {
unsigned int L,Bprime,Bprime_by_C,r,Kr,k,s,crc;
if (B<=6144) {
......@@ -56,7 +54,7 @@ int lte_segmentation(unsigned char *input_buffer,
Bprime = B+((*C)*L);
#ifdef DEBUG_SEGMENTATION
printf("Bprime %d\n",Bprime);
printf("Bprime %u\n",Bprime);
#endif
}
......@@ -68,7 +66,7 @@ int lte_segmentation(unsigned char *input_buffer,
// Find K+
Bprime_by_C = Bprime/(*C);
#ifdef DEBUG_SEGMENTATION
printf("Bprime_by_C %d\n",Bprime_by_C);
printf("Bprime_by_C %u\n",Bprime_by_C);
#endif
// Bprime = Bprime_by_C>>3;
......@@ -93,17 +91,16 @@ int lte_segmentation(unsigned char *input_buffer,
*Kminus = (*Kplus - 32);
} else if (Bprime_by_C <=6144 ) { // increase by 8 bytes til here
*Kplus = (Bprime_by_C>>6)<<6;
#ifdef DEBUG_SEGMENTATION
printf("Bprime_by_C_by_C %d , Kplus %d\n",Bprime_by_C,*Kplus);
printf("Bprime_by_C_by_C %u , Kplus %u\n",Bprime_by_C,*Kplus);
#endif
if (*Kplus < Bprime_by_C)
*Kplus = *Kplus + 64;
#ifdef DEBUG_SEGMENTATION
printf("Bprime_by_C_by_C %d , Kplus2 %d\n",Bprime_by_C,*Kplus);
printf("Bprime_by_C_by_C %u , Kplus2 %u\n",Bprime_by_C,*Kplus);
#endif
*Kminus = (*Kplus - 64);
} else {
......@@ -116,25 +113,21 @@ int lte_segmentation(unsigned char *input_buffer,
*Kminus = 0;
*Cminus = 0;
} else {
// printf("More than one segment (%d), exiting \n",*C);
// exit(-1);
*Cminus = ((*C)*(*Kplus) - (Bprime))/((*Kplus) - (*Kminus));
*Cplus = (*C) - (*Cminus);
}
AssertFatal(Bprime <= (*Cplus)*(*Kplus) + (*Cminus)*(*Kminus),
"Bprime %d < (*Cplus %d)*(*Kplus %d) + (*Cminus %d)*(*Kminus %d)\n",
Bprime,*Cplus,*Kplus,*Cminus,*Kminus);
*F = ((*Cplus)*(*Kplus) + (*Cminus)*(*Kminus) - (Bprime));
#ifdef DEBUG_SEGMENTATION
printf("C %d, Cplus %d, Cminus %d, Kplus %d, Kminus %d, Bprime_bytes %d, Bprime %d, F %d\n",*C,*Cplus,*Cminus,*Kplus,*Kminus,Bprime>>3,Bprime,*F);
printf("C %u, Cplus %u, Cminus %u, Kplus %u, Kminus %u, Bprime_bytes %u, Bprime %u, F %u\n",*C,*Cplus,*Cminus,*Kplus,*Kminus,Bprime>>3,Bprime,*F);
#endif
if ((input_buffer) && (output_buffers)) {
for (k=0; k<*F>>3; k++) {
output_buffers[0][k] = 0;
}
......@@ -142,7 +135,6 @@ int lte_segmentation(unsigned char *input_buffer,
s=0;
for (r=0; r<*C; r++) {
if (r<*Cminus)
Kr = *Kminus;
else
......@@ -157,11 +149,11 @@ int lte_segmentation(unsigned char *input_buffer,
if (*C > 1) { // add CRC
crc = crc24b(output_buffers[r],Kr-24)>>8;
output_buffers[r][(Kr-24)>>3] = ((uint8_t*)&crc)[2];
output_buffers[r][1+((Kr-24)>>3)] = ((uint8_t*)&crc)[1];
output_buffers[r][2+((Kr-24)>>3)] = ((uint8_t*)&crc)[0];
output_buffers[r][(Kr-24)>>3] = ((uint8_t *)&crc)[2];
output_buffers[r][1+((Kr-24)>>3)] = ((uint8_t *)&crc)[1];
output_buffers[r][2+((Kr-24)>>3)] = ((uint8_t *)&crc)[0];
#ifdef DEBUG_SEGMENTATION
printf("Segment %d : CRC %x\n",r,crc);
printf("Segment %u : CRC %x\n",r,crc);
#endif
}
......@@ -175,9 +167,7 @@ int lte_segmentation(unsigned char *input_buffer,
#ifdef MAIN
main()
{
main() {
unsigned int Kplus,Kminus,C,Cplus,Cminus,F,Bbytes;
for (Bbytes=5; Bbytes<2*768; Bbytes++) {
......
......@@ -34,8 +34,7 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
unsigned char Ns,
unsigned char p,
unsigned char l,
unsigned char symbol)
{
unsigned char symbol) {
int pilot[2][200] __attribute__((aligned(16)));
unsigned char nu,aarx;
unsigned short k;
......@@ -45,16 +44,14 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
// unsigned int n;
// int i;
static int interpolateS11S12 = 1;
uint16_t Nid_cell = (eNB_offset == 0) ? ue->frame_parms.Nid_cell : ue->measurements.adj_cell_id[eNB_offset-1];
uint8_t nushift,pilot0,pilot1,pilot2,pilot3;
uint8_t previous_thread_id = ue->current_thread_id[Ns>>1]==0 ? (RX_NB_TH-1):(ue->current_thread_id[Ns>>1]-1);
int **dl_ch_estimates =ue->common_vars.common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].dl_ch_estimates[eNB_offset];
int **dl_ch_estimates_previous=ue->common_vars.common_vars_rx_data_per_thread[previous_thread_id].dl_ch_estimates[eNB_offset];
int **rxdataF=ue->common_vars.common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].rxdataF;
pilot0 = 0;
if (ue->frame_parms.Ncp == 0) { // normal prefix
pilot1 = 4;
pilot2 = 7;
......@@ -81,7 +78,6 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
return(-1);
}
//ch_offset = (l*(ue->frame_parms.ofdm_symbol_size));
if (ue->high_speed_flag == 0) // use second channel estimate position for temporary storage
ch_offset = ue->frame_parms.ofdm_symbol_size ;
......@@ -89,11 +85,10 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
ch_offset = ue->frame_parms.ofdm_symbol_size*symbol;
symbol_offset = ue->frame_parms.ofdm_symbol_size*symbol;
k = (nu + nushift)%6;
#ifdef DEBUG_CH
printf("Channel Estimation : ThreadId %d, eNB_offset %d cell_id %d ch_offset %d, OFDM size %d, Ncp=%d, l=%d, Ns=%d, k=%d\n",ue->current_thread_id[Ns>>1], eNB_offset,Nid_cell,ch_offset,ue->frame_parms.ofdm_symbol_size,
printf("Channel Estimation : ThreadId %d, eNB_offset %d cell_id %d ch_offset %d, OFDM size %d, Ncp=%d, l=%d, Ns=%d, k=%d\n",ue->current_thread_id[Ns>>1], eNB_offset,Nid_cell,ch_offset,
ue->frame_parms.ofdm_symbol_size,
ue->frame_parms.Ncp,l,Ns,k);
#endif
......@@ -107,10 +102,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
fr=filt24_0r2; //for first pilot of rightmost RB
// f2r2=filt24_0r2;
f2r2=filt24_2r;
f_dc=filt24_0_dcr;
f2_dc=filt24_2_dcl;
break;
case 1 :
......@@ -174,8 +167,6 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
break;
}
// generate pilot
lte_dl_cell_spec_rx(ue,
eNB_offset,
......@@ -184,27 +175,25 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
(l==0)?0:1,
p);
for (aarx=0; aarx<ue->frame_parms.nb_antennas_rx; aarx++) {
pil = (int16_t *)&pilot[p][0];
rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+k+ue->frame_parms.first_carrier_offset))];
dl_ch = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset];
// if (eNb_id==0)
memset(dl_ch,0,4*(ue->frame_parms.ofdm_symbol_size));
if (ue->high_speed_flag==0) // multiply previous channel estimate by ch_est_alpha
multadd_complex_vector_real_scalar(dl_ch-(ue->frame_parms.ofdm_symbol_size<<1),
ue->ch_est_alpha,dl_ch-(ue->frame_parms.ofdm_symbol_size<<1),
1,ue->frame_parms.ofdm_symbol_size);
#ifdef DEBUG_CH
printf("k %d, first_carrier %d\n",k,ue->frame_parms.first_carrier_offset);
#endif
if ((ue->frame_parms.N_RB_DL==6) ||
(ue->frame_parms.N_RB_DL==50) ||
(ue->frame_parms.N_RB_DL==100)) {
//First half of pilots
// Treat first 2 pilots specially (left edge)
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
......@@ -219,7 +208,6 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2; // Re Im
rxF+=12;
dl_ch+=8;
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
......@@ -234,28 +222,22 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
dl_ch+=16;
for (pilot_cnt=2; pilot_cnt<((ue->frame_parms.N_RB_DL)-1); pilot_cnt+=2) {
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); //Re
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); //Im
#ifdef DEBUG_CH
printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
printf("pilot %u : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f,
ch,
dl_ch,
24);
pil+=2; // Re Im
rxF+=12;
dl_ch+=8;
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
printf("pilot %u : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f2,
ch,
......@@ -264,29 +246,25 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2;
rxF+=12;
dl_ch+=16;
}
// printf("Second half\n");
// Second half of RBs
k = (nu + nushift)%6;
if (k > 6)
k -=6;
rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+1+k))];
#ifdef DEBUG_CH
printf("second half k %d\n",k);
#endif
for (pilot_cnt=0; pilot_cnt<((ue->frame_parms.N_RB_DL)-3); pilot_cnt+=2) {
for (pilot_cnt=0; pilot_cnt<((ue->frame_parms.N_RB_DL)-3); pilot_cnt+=2) {
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
printf("pilot %u : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f,
ch,
......@@ -295,11 +273,10 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2;
rxF+=12;
dl_ch+=8;
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
printf("pilot %u : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f2,
ch,
......@@ -308,13 +285,12 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2;
rxF+=12;
dl_ch+=16;
}
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot %d: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
printf("pilot %u: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(fr,
ch,
......@@ -323,34 +299,25 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2; // Re Im
rxF+=12;
dl_ch+=8;
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot %d: rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
printf("pilot %u: rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
#endif
multadd_real_vector_complex_scalar(f2r2,
ch,
dl_ch,
24);
}
else if (ue->frame_parms.N_RB_DL==25) {
} else if (ue->frame_parms.N_RB_DL==25) {
//printf("Channel estimation\n");
// Treat first 2 pilots specially (left edge)
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot 0 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
// ch[0] = 1024;
// ch[1] = -128;
#endif
multadd_real_vector_complex_scalar(fl,
ch,
dl_ch,
......@@ -358,17 +325,13 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2; // Re Im
rxF+=12;
dl_ch+=8;
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot 1 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
// ch[0] = 1024;
// ch[1] = -128;
#endif
multadd_real_vector_complex_scalar(f2l2,
ch,
dl_ch,
......@@ -378,21 +341,15 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
dl_ch+=16;
for (pilot_cnt=2; pilot_cnt<24; pilot_cnt+=2) {
// printf("pilot[%d][%d] (%d,%d)\n",p,rb,pil[0],pil[1]);
// printf("rx[%d][%d] -> (%d,%d)\n",p,ue->frame_parms.first_carrier_offset + ue->frame_parms.nushift + 6*rb+(3*p),rxF[0],rxF[1]);
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
printf("pilot %u : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
// ch[0] = 1024;
// ch[1] = -128;
#endif
multadd_real_vector_complex_scalar(f,
ch,
dl_ch,
......@@ -400,13 +357,10 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2; // Re Im
rxF+=12;
dl_ch+=8;
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
printf("pilot %u : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
// ch[0] = 1024;
// ch[1] = -128;
#endif
......@@ -417,39 +371,31 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2;
rxF+=12;
dl_ch+=16;
}
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot 24: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
// ch[0] = 1024;
// ch[1] = -128;
#endif
multadd_real_vector_complex_scalar(f_dc,
ch,
dl_ch,
24);
pil+=2; // Re Im
dl_ch+=8;
// printf("Second half\n");
// Second half of RBs
rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+1+k))];
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot 25: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
// ch[0] = 1024;
// ch[1] = -128;
#endif
multadd_real_vector_complex_scalar(f2_dc,
ch,
dl_ch,
......@@ -459,19 +405,15 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
dl_ch+=16;
for (pilot_cnt=0; pilot_cnt<22; pilot_cnt+=2) {
// printf("* pilot[%d][%d] (%d,%d)\n",p,rb,pil[0],pil[1]);
// printf("rx[%d][%d] -> (%d,%d)\n",p,ue->frame_parms.first_carrier_offset + ue->frame_parms.nushift + 6*rb+(3*p),rxF[0],rxF[1]);
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot %d rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",26+pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
printf("pilot %u rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",26+pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
// ch[0] = 1024;
// ch[1] = -128;
#endif
multadd_real_vector_complex_scalar(f,
ch,
dl_ch,
......@@ -479,16 +421,13 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2;
rxF+=12;
dl_ch+=8;
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot %d : rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",27+pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
printf("pilot %u : rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",27+pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
// ch[0] = 1024;
// ch[1] = -128;
#endif
multadd_real_vector_complex_scalar(f2,
ch,
dl_ch,
......@@ -496,20 +435,15 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2;
rxF+=12;
dl_ch+=16;
}
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot 49: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
// ch[0] = 1024;
// ch[1] = -128;
#endif
multadd_real_vector_complex_scalar(fr,
ch,
dl_ch,
......@@ -517,28 +451,20 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2; // Re Im
rxF+=12;
dl_ch+=8;
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
#ifdef DEBUG_CH
printf("pilot 50: rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
// ch[0] = 1024;
// ch[1] = -128;
#endif
multadd_real_vector_complex_scalar(f2r2,
ch,
dl_ch,
24);
} else if (ue->frame_parms.N_RB_DL==15) {
//printf("First Half\n");
for (rb=0; rb<28; rb+=4) {
//printf("aarx=%d\n",aarx);
//printf("pilot[%d][%d] (%d,%d)\n",p,rb,pil[0],pil[1]);
//printf("rx[%d][%d] -> (%d,%d)\n",p,
......@@ -555,7 +481,6 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2; // Re Im
rxF+=12;
dl_ch+=8;
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
//printf("ch -> (%d,%d)\n",ch[0],ch[1]);
......@@ -566,7 +491,6 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2;
rxF+=12;
dl_ch+=16;
}
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
......@@ -578,13 +502,11 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
24);
pil+=2; // Re Im
dl_ch+=8;
//printf("Second half\n");
//Second half of RBs
rxF = (int16_t *)&rxdataF[aarx][((symbol_offset+1+nushift + (3*p)))];
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
multadd_real_vector_complex_scalar(f2,
ch,
dl_ch,
......@@ -602,7 +524,6 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
// rxF[1]);
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
multadd_real_vector_complex_scalar(f,
ch,
dl_ch,
......@@ -610,10 +531,8 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2;
rxF+=12;
dl_ch+=8;
ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
multadd_real_vector_complex_scalar(f2,
ch,
dl_ch,
......@@ -621,17 +540,14 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
pil+=2;
rxF+=12;
dl_ch+=16;
}
} else {
LOG_E(PHY,"channel estimation not implemented for ue->frame_parms.N_RB_DL = %d\n",ue->frame_parms.N_RB_DL);
}
if (ue->perfect_ce == 0) {
// Temporal Interpolation
// printf("ch_offset %d\n",ch_offset);
dl_ch = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset];
if (ue->high_speed_flag == 0) {
......@@ -642,14 +558,11 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
if ((symbol == 0)) {
// printf("Interpolating %d->0\n",4-ue->frame_parms.Ncp);
// dl_ch_prev = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][(4-ue->frame_parms.Ncp)*(ue->frame_parms.ofdm_symbol_size)];
if(((Ns>>1)!=0) || ( ((Ns>>1)==0) && interpolateS11S12))
{
if(((Ns>>1)!=0) || ( ((Ns>>1)==0) && interpolateS11S12)) {
//LOG_I(PHY,"Interpolate s11-->s0 to get s12 and s13 Ns %d \n", Ns);
dl_ch_prev = (int16_t *)&dl_ch_estimates_previous[(p<<1)+aarx][pilot3*(ue->frame_parms.ofdm_symbol_size)];
multadd_complex_vector_real_scalar(dl_ch_prev,21845,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,10923,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),0,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch_prev,10923,dl_ch_prev+(2*((ue->frame_parms.ofdm_symbol_size)<<1)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,21845,dl_ch_prev+(2*((ue->frame_parms.ofdm_symbol_size)<<1)),0,ue->frame_parms.ofdm_symbol_size);
}
......@@ -661,50 +574,38 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
//LOG_I(PHY,"Interpolate s0-->s4 to get s1 s2 and s3 Ns %d \n", Ns);
if (ue->frame_parms.Ncp==0) {// pilot spacing 4 symbols (1/4,1/2,3/4 combination)
uint8_t previous_subframe;
if(Ns>>1 == 0)
previous_subframe = 9;
else
previous_subframe = ((Ns>>1) - 1 )%9;
if((subframe_select(&ue->frame_parms,previous_subframe) == SF_UL))
{
if((subframe_select(&ue->frame_parms,previous_subframe) == SF_UL)) {
multadd_complex_vector_real_scalar(dl_ch_prev,328,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,32440,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),0,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch_prev,328,dl_ch_prev+(2*((ue->frame_parms.ofdm_symbol_size)<<1)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,32440,dl_ch_prev+(2*((ue->frame_parms.ofdm_symbol_size)<<1)),0,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch_prev,8192,dl_ch_prev+(3*2*(ue->frame_parms.ofdm_symbol_size)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,32440,dl_ch_prev+(3*2*(ue->frame_parms.ofdm_symbol_size)),0,ue->frame_parms.ofdm_symbol_size);
}
else
{
} else {
multadd_complex_vector_real_scalar(dl_ch_prev,24576,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,8192,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),0,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch_prev,16384,dl_ch_prev+(2*((ue->frame_parms.ofdm_symbol_size)<<1)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,16384,dl_ch_prev+(2*((ue->frame_parms.ofdm_symbol_size)<<1)),0,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch_prev,8192,dl_ch_prev+(3*2*(ue->frame_parms.ofdm_symbol_size)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,24576,dl_ch_prev+(3*2*(ue->frame_parms.ofdm_symbol_size)),0,ue->frame_parms.ofdm_symbol_size);
}
} else {
multadd_complex_vector_real_scalar(dl_ch_prev,328,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,21845,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),0,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch_prev,21845,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)<<1),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,10923,dl_ch_prev+(2*((ue->frame_parms.ofdm_symbol_size)<<1)),0,ue->frame_parms.ofdm_symbol_size);
} // pilot spacing 3 symbols (1/3,2/3 combination)
} else if (symbol == pilot2) {
dl_ch_prev = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][pilot1*(ue->frame_parms.ofdm_symbol_size)];
multadd_complex_vector_real_scalar(dl_ch_prev,21845,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,10923,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),0,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch_prev,10923,dl_ch_prev+(2*((ue->frame_parms.ofdm_symbol_size)<<1)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,21845,dl_ch_prev+(2*((ue->frame_parms.ofdm_symbol_size)<<1)),0,ue->frame_parms.ofdm_symbol_size);
} else { // symbol == pilot3
......@@ -714,54 +615,45 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
if (ue->frame_parms.Ncp==0) {// pilot spacing 4 symbols (1/4,1/2,3/4 combination)
multadd_complex_vector_real_scalar(dl_ch_prev,24576,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,8192,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),0,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch_prev,16384,dl_ch_prev+(2*((ue->frame_parms.ofdm_symbol_size)<<1)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,16384,dl_ch_prev+(2*((ue->frame_parms.ofdm_symbol_size)<<1)),0,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch_prev,8192,dl_ch_prev+(3*2*(ue->frame_parms.ofdm_symbol_size)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,24576,dl_ch_prev+(3*2*(ue->frame_parms.ofdm_symbol_size)),0,ue->frame_parms.ofdm_symbol_size);
} else {
multadd_complex_vector_real_scalar(dl_ch_prev,10923,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,21845,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)),0,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch_prev,21845,dl_ch_prev+(2*(ue->frame_parms.ofdm_symbol_size)<<1),1,ue->frame_parms.ofdm_symbol_size);
multadd_complex_vector_real_scalar(dl_ch,10923,dl_ch_prev+(2*((ue->frame_parms.ofdm_symbol_size)<<1)),0,ue->frame_parms.ofdm_symbol_size);
} // pilot spacing 3 symbols (1/3,2/3 combination)
if((ue->rx_offset_diff !=0) && ((Ns>>1) == 9))
{
if((ue->rx_offset_diff !=0) && ((Ns>>1) == 9)) {
//LOG_I(PHY,"Extrapolate s7-->s11 to get s12 and s13 Ns %d\n", Ns);
interpolateS11S12 = 0;
//LOG_E(PHY,"Interpolate s7--s11 s12 s13 pilot 3 Ns %d l %d symbol %d \n", Ns, l, symbol);
int16_t *dlChEst_ofdm11 = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][pilot3*(ue->frame_parms.ofdm_symbol_size)];
int16_t *dlChEst_ofdm7 = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][pilot2*(ue->frame_parms.ofdm_symbol_size)];
// interpolate ofdm s12: 5/4*ofdms11 + -1/4*ofdms7 5/4 q1.15 40960 -1/4 q1.15 8192
int16_t *dlChEst_ofdm12 = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][12*ue->frame_parms.ofdm_symbol_size];
for(int i=0; i<(2*ue->frame_parms.ofdm_symbol_size); i++)
{
for(int i=0; i<(2*ue->frame_parms.ofdm_symbol_size); i++) {
int64_t tmp_mult = 0;
tmp_mult = ((int64_t)dlChEst_ofdm11[i] * 40960 - (int64_t)dlChEst_ofdm7[i] * 8192);
tmp_mult = tmp_mult >> 15;
dlChEst_ofdm12[i] = tmp_mult;
}
// interpolate ofdm s13: 3/2*ofdms11 + -1/2*ofdms7 3/2 q1.15 49152 1/2 q1.15 16384
int16_t *dlChEst_ofdm13 = (int16_t *)&dl_ch_estimates[(p<<1)+aarx][13*ue->frame_parms.ofdm_symbol_size];
for(int i=0; i<(2*ue->frame_parms.ofdm_symbol_size); i++)
{
for(int i=0; i<(2*ue->frame_parms.ofdm_symbol_size); i++) {
int64_t tmp_mult = 0;
tmp_mult = ((int64_t)dlChEst_ofdm11[i] * 49152 - (int64_t)dlChEst_ofdm7[i] * 16384);
tmp_mult = tmp_mult >> 15;
dlChEst_ofdm13[i] = tmp_mult;
}
}
}
}
}
}
......@@ -797,16 +689,14 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
break;
}
if( ((Ns%2) == 0) && (l == pilot0))
{
if( ((Ns%2) == 0) && (l == pilot0)) {
// do ifft of channel estimate
for (aarx=0; aarx<ue->frame_parms.nb_antennas_rx; aarx++)
for (p=0; p<ue->frame_parms.nb_antenna_ports_eNB; p++) {
if (ue->common_vars.common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].dl_ch_estimates[eNB_offset][(p<<1)+aarx])
{
if (ue->common_vars.common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].dl_ch_estimates[eNB_offset][(p<<1)+aarx]) {
//LOG_I(PHY,"Channel Impulse Computation Slot %d ThreadId %d Symbol %d \n", Ns, ue->current_thread_id[Ns>>1], l);
idft((int16_t*) &ue->common_vars.common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].dl_ch_estimates[eNB_offset][(p<<1)+aarx][8],
(int16_t*) ue->common_vars.common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].dl_ch_estimates_time[eNB_offset][(p<<1)+aarx],1);
idft((int16_t *) &ue->common_vars.common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].dl_ch_estimates[eNB_offset][(p<<1)+aarx][8],
(int16_t *) ue->common_vars.common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].dl_ch_estimates_time[eNB_offset][(p<<1)+aarx],1);
}
}
}
......@@ -814,7 +704,6 @@ int lte_dl_channel_estimation(PHY_VARS_UE *ue,
T(T_UE_PHY_DL_CHANNEL_ESTIMATE, T_INT(eNB_id),
T_INT(ue->proc.proc_rxtx[ue->current_thread_id[Ns>>1]].frame_rx%1024), T_INT(ue->proc.proc_rxtx[ue->current_thread_id[Ns>>1]].subframe_rx),
T_INT(0), T_BUFFER(&ue->common_vars.common_vars_rx_data_per_thread[ue->current_thread_id[Ns>>1]].dl_ch_estimates_time[eNB_offset][0][0], 512 * 4));
return(0);
}
......@@ -39,7 +39,7 @@
#include "prach_extern.h"
#if (LTE_RRC_VERSION < MAKE_VERSION(14, 0, 0))
#define rx_prach0 rx_prach
#define rx_prach0 rx_prach
#endif
void rx_prach0(PHY_VARS_eNB *eNB,
......@@ -53,11 +53,8 @@ void rx_prach0(PHY_VARS_eNB *eNB,
,uint8_t br_flag,
uint8_t ce_level
#endif
)
{
) {
int i;
LTE_DL_FRAME_PARMS *fp;
lte_frame_type_t frame_type;
uint16_t rootSequenceIndex;
......@@ -69,7 +66,6 @@ void rx_prach0(PHY_VARS_eNB *eNB,
int16_t *prachF=NULL;
int16_t **rxsigF=NULL;
int nb_rx;
int16_t *prach2;
uint8_t preamble_index;
uint16_t NCS,NCS2;
......@@ -93,26 +89,23 @@ void rx_prach0(PHY_VARS_eNB *eNB,
int16_t levdB;
int fft_size,log2_ifft_size;
int16_t prach_ifft_tmp[2048*2] __attribute__((aligned(32)));
int32_t *prach_ifft=(int32_t*)NULL;
int32_t *prach_ifft=(int32_t *)NULL;
int32_t **prach_ifftp=(int32_t **)NULL;
#if (LTE_RRC_VERSION >= MAKE_VERSION(14, 0, 0))
int prach_ifft_cnt=0;
#endif
if (ru) {
fp = &ru->frame_parms;
nb_rx = ru->nb_rx;
}
else if (eNB) {
} else if (eNB) {
fp = &eNB->frame_parms;
nb_rx = fp->nb_antennas_rx;
}
else AssertFatal(1==0,"rx_prach called without valid RU or eNB descriptor\n");
} else AssertFatal(1==0,"rx_prach called without valid RU or eNB descriptor\n");
frame_type = fp->frame_type;
#if (LTE_RRC_VERSION >= MAKE_VERSION(14, 0, 0))
if (br_flag == 1) {
AssertFatal(fp->prach_emtc_config_common.prach_Config_enabled==1,
"emtc prach_Config is not enabled\n");
......@@ -129,8 +122,7 @@ void rx_prach0(PHY_VARS_eNB *eNB,
max_preamble += ce_level;
max_preamble_energy += ce_level;
max_preamble_delay += ce_level;
}
else
} else
#endif
{
rootSequenceIndex = fp->prach_config_common.rootSequenceIndex;
......@@ -148,13 +140,16 @@ void rx_prach0(PHY_VARS_eNB *eNB,
if (eNB) {
#if (LTE_RRC_VERSION >= MAKE_VERSION(14, 0, 0))
if (br_flag == 1) {
prach_ifftp = eNB->prach_vars_br.prach_ifft[ce_level];
subframe = eNB->proc.subframe_prach_br;
prachF = eNB->prach_vars_br.prachF;
rxsigF = eNB->prach_vars_br.rxsigF[ce_level];
if (LOG_DEBUGFLAG(PRACH)){
if (((ru->proc.frame_prach)&1023) < 20) LOG_I(PHY,"PRACH (eNB) : running rx_prach (br_flag %d, ce_level %d) for frame %d subframe %d, prach_FreqOffset %d, prach_ConfigIndex %d, rootSequenceIndex %d, repetition number %d,numRepetitionsPrePreambleAttempt %d\n",
if (LOG_DEBUGFLAG(PRACH)) {
if (((ru->proc.frame_prach)&1023) < 20) LOG_I(PHY,
"PRACH (eNB) : running rx_prach (br_flag %d, ce_level %d) for frame %d subframe %d, prach_FreqOffset %d, prach_ConfigIndex %d, rootSequenceIndex %d, repetition number %d,numRepetitionsPrePreambleAttempt %d\n",
br_flag,ce_level,ru->proc.frame_prach,subframe,
fp->prach_emtc_config_common.prach_ConfigInfo.prach_FreqOffset[ce_level],
prach_ConfigIndex,rootSequenceIndex,
......@@ -168,17 +163,20 @@ void rx_prach0(PHY_VARS_eNB *eNB,
subframe = eNB->proc.subframe_prach;
prachF = eNB->prach_vars.prachF;
rxsigF = eNB->prach_vars.rxsigF[0];
if (LOG_DEBUGFLAG(PRACH)){
if (((ru->proc.frame_prach)&1023) < 20) LOG_I(PHY,"PRACH (eNB) : running rx_prach for subframe %d, prach_FreqOffset %d, prach_ConfigIndex %d , rootSequenceIndex %d\n", subframe,fp->prach_config_common.prach_ConfigInfo.prach_FreqOffset,prach_ConfigIndex,rootSequenceIndex);
}
if (LOG_DEBUGFLAG(PRACH)) {
if (((ru->proc.frame_prach)&1023) < 20) LOG_I(PHY,"PRACH (eNB) : running rx_prach for subframe %d, prach_FreqOffset %d, prach_ConfigIndex %d , rootSequenceIndex %d\n", subframe,
fp->prach_config_common.prach_ConfigInfo.prach_FreqOffset,prach_ConfigIndex,rootSequenceIndex);
}
}
else {
} else {
#if (LTE_RRC_VERSION >= MAKE_VERSION(14, 0, 0))
if (br_flag == 1) {
subframe = ru->proc.subframe_prach_br;
rxsigF = ru->prach_rxsigF_br[ce_level];
if (LOG_DEBUGFLAG(PRACH)){
if (LOG_DEBUGFLAG(PRACH)) {
if (((ru->proc.frame_prach)&1023) < 20) LOG_I(PHY,"PRACH (RU) : running rx_prach (br_flag %d, ce_level %d) for frame %d subframe %d, prach_FreqOffset %d, prach_ConfigIndex %d\n",
br_flag,ce_level,ru->proc.frame_prach,subframe,fp->prach_emtc_config_common.prach_ConfigInfo.prach_FreqOffset[ce_level],prach_ConfigIndex);
}
......@@ -187,12 +185,12 @@ void rx_prach0(PHY_VARS_eNB *eNB,
{
subframe = ru->proc.subframe_prach;
rxsigF = ru->prach_rxsigF;
if (LOG_DEBUGFLAG(PRACH)){
if (LOG_DEBUGFLAG(PRACH)) {
if (((ru->proc.frame_prach)&1023) < 20) LOG_I(PHY,"PRACH (RU) : running rx_prach for subframe %d, prach_FreqOffset %d, prach_ConfigIndex %d\n",
subframe,fp->prach_config_common.prach_ConfigInfo.prach_FreqOffset,prach_ConfigIndex);
}
}
}
AssertFatal(ru!=NULL,"ru is null\n");
......@@ -200,21 +198,25 @@ void rx_prach0(PHY_VARS_eNB *eNB,
for (aa=0; aa<nb_rx; aa++) {
if (ru->if_south == LOCAL_RF) { // set the time-domain signal if we have to use it in this node
// DJP - indexing below in subframe zero takes us off the beginning of the array???
prach[aa] = (int16_t*)&ru->common.rxdata[aa][(subframe*fp->samples_per_tti)-ru->N_TA_offset];
prach[aa] = (int16_t *)&ru->common.rxdata[aa][(subframe*fp->samples_per_tti)-ru->N_TA_offset];
if (LOG_DUMPFLAG(PRACH)){
int32_t en0=signal_energy((int32_t*)prach[aa],fp->samples_per_tti);
if (LOG_DUMPFLAG(PRACH)) {
int32_t en0=signal_energy((int32_t *)prach[aa],fp->samples_per_tti);
int8_t dbEn0 = dB_fixed(en0);
int8_t rach_dBm = dbEn0 - ru->rx_total_gain_dB;
char buffer[80];
if (dbEn0>32 && prach[0]!= NULL) {
static int counter=0;
sprintf(buffer, "%s%d", "/tmp/prach_rx",counter);
LOG_M(buffer,"prach_rx",prach[0],fp->samples_per_tti,1,13);
}
if (dB_fixed(en0)>32) {
sprintf(buffer, "rach_dBm:%d",rach_dBm);
if (prach[0]!= NULL) LOG_M("prach_rx","prach_rx",prach[0],fp->samples_per_tti,1,1);
LOG_I(PHY,"RU %d, br_flag %d ce_level %d frame %d subframe %d per_tti:%d prach:%p (energy %d) TA:%d %s rxdata:%p index:%d\n",
ru->idx,br_flag,ce_level,ru->proc.frame_prach,subframe,fp->samples_per_tti,
prach[aa],dbEn0,ru->N_TA_offset,buffer,ru->common.rxdata[aa],
......@@ -237,9 +239,7 @@ void rx_prach0(PHY_VARS_eNB *eNB,
if (eNB) start_meas(&eNB->rx_prach);
prach_root_sequence_map = (prach_fmt < 4) ? prach_root_sequence_map0_3 : prach_root_sequence_map4;
// PDP is oversampled, e.g. 1024 sample instead of 839
// Adapt the NCS (zero-correlation zones) with oversampling factor e.g. 1024/839
NCS2 = (N_ZC==839) ? ((NCS<<10)/839) : ((NCS<<8)/139);
......@@ -295,16 +295,17 @@ void rx_prach0(PHY_VARS_eNB *eNB,
case 100:
if (fp->threequarter_fs == 1)
Ncp=(Ncp*3)>>2;
break;
}
if (((eNB!=NULL) && (ru->function != NGFI_RAU_IF4p5))||
((eNB==NULL) && (ru->function == NGFI_RRU_IF4p5))) { // compute the DFTs of the PRACH temporal resources
// Do forward transform
if (LOG_DEBUGFLAG(PRACH)) {
LOG_D(PHY,"rx_prach: Doing FFT for N_RB_UL %d nb_rx:%d Ncp:%d\n",fp->N_RB_UL, nb_rx, Ncp);
}
for (aa=0; aa<nb_rx; aa++) {
AssertFatal(prach[aa]!=NULL,"prach[%d] is null\n",aa);
prach2 = prach[aa] + (Ncp<<1);
......@@ -412,43 +413,36 @@ void rx_prach0(PHY_VARS_eNB *eNB,
//LOG_D(PHY,"Shifting prach_rxF from %d to 0\n",k);
if ((k+(839*2)) > dftsize_x2) { // PRACH signal is split around DC
memmove((void*)&rxsigF[aa][dftsize_x2-k],(void*)&rxsigF[aa][0],(k+(839*2)-dftsize_x2)*2);
memmove((void*)&rxsigF[aa][0],(void*)(&rxsigF[aa][k]),(dftsize_x2-k)*2);
memmove((void *)&rxsigF[aa][dftsize_x2-k],(void *)&rxsigF[aa][0],(k+(839*2)-dftsize_x2)*2);
memmove((void *)&rxsigF[aa][0],(void *)(&rxsigF[aa][k]),(dftsize_x2-k)*2);
} else // PRACH signal is not split around DC
memmove((void *)&rxsigF[aa][0],(void *)(&rxsigF[aa][k]),839*4);
}
else // PRACH signal is not split around DC
memmove((void*)&rxsigF[aa][0],(void*)(&rxsigF[aa][k]),839*4);
}
}
if ((eNB==NULL) && (ru!=NULL) && ru->function == NGFI_RRU_IF4p5) {
if ((eNB==NULL) && ru->function == NGFI_RRU_IF4p5) {
/// **** send_IF4 of rxsigF to RAU **** ///
#if (LTE_RRC_VERSION >= MAKE_VERSION(14, 0, 0))
if (br_flag == 1) send_IF4p5(ru, ru->proc.frame_prach, ru->proc.subframe_prach, IF4p5_PRACH+1+ce_level);
else
#endif
send_IF4p5(ru, ru->proc.frame_prach, ru->proc.subframe_prach, IF4p5_PRACH);
return;
} else if (eNB!=NULL) {
if ( LOG_DEBUGFLAG(PRACH)) {
int en = dB_fixed(signal_energy((int32_t*)&rxsigF[0][0],840));
int en = dB_fixed(signal_energy((int32_t *)&rxsigF[0][0],840));
if ((en > 60)&&(br_flag==1)) LOG_I(PHY,"PRACH (br_flag %d,ce_level %d, n_ra_prb %d, k %d): Frame %d, Subframe %d => %d dB\n",br_flag,ce_level,n_ra_prb,k,eNB->proc.frame_rx,eNB->proc.subframe_rx,en);
}
}
// in case of RAU and prach received rx_thread wakes up prach
// here onwards is for eNodeB_3GPP or NGFI_RAU_IF4p5
preamble_offset_old = 99;
uint8_t update_TA = 4;
uint8_t update_TA2 = 1;
switch (eNB->frame_parms.N_RB_DL) {
case 6:
update_TA = 16;
......@@ -465,18 +459,22 @@ void rx_prach0(PHY_VARS_eNB *eNB,
case 75:
update_TA = 3;
update_TA2 = 2;
break;
case 100:
update_TA = 1;
break;
}
*max_preamble_energy=0;
for (preamble_index=0 ; preamble_index<64 ; preamble_index++) {
if (LOG_DEBUGFLAG(PRACH)) {
int en = dB_fixed(signal_energy((int32_t *)&rxsigF[0][0],840));
if (LOG_DEBUGFLAG(PRACH)){
int en = dB_fixed(signal_energy((int32_t*)&rxsigF[0][0],840));
if (en>60) LOG_I(PHY,"frame %d, subframe %d : Trying preamble %d (br_flag %d)\n",ru->proc.frame_prach,subframe,preamble_index,br_flag);
}
if (restricted_set == 0) {
// This is the relative offset in the root sequence table (5.7.2-4 from 36.211) for the given preamble index
preamble_offset = ((NCS==0)? preamble_index : (preamble_index/(N_ZC/NCS)));
......@@ -486,9 +484,7 @@ void rx_prach0(PHY_VARS_eNB *eNB,
new_dft = 1;
// This is the \nu corresponding to the preamble index
preamble_shift = 0;
}
else {
} else {
preamble_shift -= NCS;
if (preamble_shift < 0)
......@@ -519,7 +515,6 @@ void rx_prach0(PHY_VARS_eNB *eNB,
}
u = prach_root_sequence_map[index];
uint16_t n_group_ra = 0;
if ( (du[u]<(N_ZC/3)) && (du[u]>=NCS) ) {
......@@ -560,46 +555,51 @@ void rx_prach0(PHY_VARS_eNB *eNB,
// Compute DFT of RX signal (conjugate input, results in conjugate output) for each new rootSequenceIndex
if (LOG_DEBUGFLAG(PRACH)) {
int en = dB_fixed(signal_energy((int32_t*)&rxsigF[0][0],840));
int en = dB_fixed(signal_energy((int32_t *)&rxsigF[0][0],840));
if (en>60) LOG_I(PHY,"frame %d, subframe %d : preamble index %d: offset %d, preamble shift %d (br_flag %d, en %d)\n",
ru->proc.frame_prach,subframe,preamble_index,preamble_offset,preamble_shift,br_flag,en);
}
log2_ifft_size = 10;
fft_size = 6144;
if (new_dft == 1) {
new_dft = 0;
#if (LTE_RRC_VERSION >= MAKE_VERSION(14, 0, 0))
if (br_flag == 1) {
Xu=(int16_t*)eNB->X_u_br[ce_level][preamble_offset-first_nonzero_root_idx];
Xu=(int16_t *)eNB->X_u_br[ce_level][preamble_offset-first_nonzero_root_idx];
prach_ifft = prach_ifftp[prach_ifft_cnt++];
if (eNB->prach_vars_br.repetition_number[ce_level]==1) memset(prach_ifft,0,((N_ZC==839)?2048:256)*sizeof(int32_t));
}
else
} else
#endif
{
Xu=(int16_t*)eNB->X_u[preamble_offset-first_nonzero_root_idx];
Xu=(int16_t *)eNB->X_u[preamble_offset-first_nonzero_root_idx];
prach_ifft = prach_ifftp[0];
memset(prach_ifft,0,((N_ZC==839) ? 2048 : 256)*sizeof(int32_t));
}
memset(prachF, 0, sizeof(int16_t)*2*1024 );
if (LOG_DUMPFLAG(PRACH)) {
if (prach[0]!= NULL) LOG_M("prach_rx0.m","prach_rx0",prach[0],6144+792,1,1);
LOG_M("prach_rx1.m","prach_rx1",prach[1],6144+792,1,1);
LOG_M("prach_rxF0.m","prach_rxF0",rxsigF[0],24576,1,1);
LOG_M("prach_rxF1.m","prach_rxF1",rxsigF[1],6144,1,1);
}
for (aa=0;aa<nb_rx; aa++) {
for (aa=0; aa<nb_rx; aa++) {
// Do componentwise product with Xu* on each antenna
k=0;
for (offset=0; offset<(N_ZC<<1); offset+=2) {
prachF[offset] = (int16_t)(((int32_t)Xu[offset]*rxsigF[aa][k] + (int32_t)Xu[offset+1]*rxsigF[aa][k+1])>>15);
prachF[offset+1] = (int16_t)(((int32_t)Xu[offset]*rxsigF[aa][k+1] - (int32_t)Xu[offset+1]*rxsigF[aa][k])>>15);
k+=2;
if (k==(12*2*fp->ofdm_symbol_size))
k=0;
}
......@@ -608,19 +608,22 @@ void rx_prach0(PHY_VARS_eNB *eNB,
if (N_ZC == 839) {
log2_ifft_size = 10;
idft1024(prachF,prach_ifft_tmp,1);
// compute energy and accumulate over receive antennas and repetitions for BR
for (i=0;i<2048;i++)
for (i=0; i<2048; i++)
prach_ifft[i] += (prach_ifft_tmp[i<<1]*prach_ifft_tmp[i<<1] + prach_ifft_tmp[1+(i<<1)]*prach_ifft_tmp[1+(i<<1)])>>10;
} else {
idft256(prachF,prach_ifft_tmp,1);
log2_ifft_size = 8;
// compute energy and accumulate over receive antennas and repetitions for BR
for (i=0;i<256;i++)
for (i=0; i<256; i++)
prach_ifft[i] += (prach_ifft_tmp[i<<1]*prach_ifft_tmp[(i<<1)] + prach_ifft_tmp[1+(i<<1)]*prach_ifft_tmp[1+(i<<1)])>>10;
}
if (LOG_DUMPFLAG(PRACH)) {
if (aa==0) LOG_M("prach_rxF_comp0.m","prach_rxF_comp0",prachF,1024,1,1);
if (aa==1) LOG_M("prach_rxF_comp1.m","prach_rxF_comp1",prachF,1024,1,1);
}
}// antennas_rx
......@@ -628,17 +631,19 @@ void rx_prach0(PHY_VARS_eNB *eNB,
// check energy in nth time shift, for
#if (LTE_RRC_VERSION >= MAKE_VERSION(14, 0, 0))
if ((br_flag==0) ||
(eNB->prach_vars_br.repetition_number[ce_level]==
eNB->frame_parms.prach_emtc_config_common.prach_ConfigInfo.prach_numRepetitionPerPreambleAttempt[ce_level]))
#endif
{
if (LOG_DEBUGFLAG(PRACH)){
int en = dB_fixed(signal_energy((int32_t*)&rxsigF[0][0],840));
if (LOG_DEBUGFLAG(PRACH)) {
int en = dB_fixed(signal_energy((int32_t *)&rxsigF[0][0],840));
if (en>60) LOG_I(PHY,"frame %d, subframe %d: Checking for peak in time-domain (br_flag %d, en %d)\n",ru->proc.frame_prach,subframe,br_flag,en);
}
preamble_shift2 = ((preamble_shift==0) ? 0 : ((preamble_shift<<log2_ifft_size)/N_ZC));
preamble_shift2 = ((preamble_shift==0) ? 0 : ((preamble_shift<<log2_ifft_size)/N_ZC));
for (i=0; i<NCS2; i++) {
lev = (int32_t)prach_ifft[(preamble_shift2+i)];
......@@ -648,8 +653,10 @@ void rx_prach0(PHY_VARS_eNB *eNB,
*max_preamble_energy = levdB;
*max_preamble_delay = ((i*fft_size)>>log2_ifft_size)*update_TA/update_TA2;
*max_preamble = preamble_index;
if (LOG_DEBUGFLAG(PRACH)){
int en = dB_fixed(signal_energy((int32_t*)&rxsigF[0][0],840));
if (LOG_DEBUGFLAG(PRACH)) {
int en = dB_fixed(signal_energy((int32_t *)&rxsigF[0][0],840));
if ((en>60) && (br_flag==1))
LOG_D(PHY,"frame %d, subframe %d : max_preamble_energy %d, max_preamble_delay %d, max_preamble %d (br_flag %d,ce_level %d, levdB %d, lev %d)\n",
ru->proc.frame_prach,subframe,
......@@ -658,12 +665,12 @@ void rx_prach0(PHY_VARS_eNB *eNB,
}
}
}
}
}// preamble_index
if (LOG_DUMPFLAG(PRACH)) {
int en = dB_fixed(signal_energy((int32_t*)&rxsigF[0][0],840));
int en = dB_fixed(signal_energy((int32_t *)&rxsigF[0][0],840));
if (en>60) {
k = (12*n_ra_prb) - 6*fp->N_RB_UL;
......@@ -678,8 +685,7 @@ void rx_prach0(PHY_VARS_eNB *eNB,
LOG_M("prach_rxF_comp0.m","prach_rxF_comp0",prachF,1024,1,1);
LOG_M("Xu.m","xu",Xu,N_ZC,1,1);
LOG_M("prach_ifft0.m","prach_t0",prach_ifft,1024,1,1);
}
else {
} else {
LOG_E(PHY,"Dumping prach (br_flag %d), k = %d (n_ra_prb %d)\n",br_flag,k,n_ra_prb);
LOG_M("rxsigF_br.m","prach_rxF_br",&rxsigF[0][0],12288,1,1);
LOG_M("prach_rxF_comp0_br.m","prach_rxF_comp0_br",prachF,1024,1,1);
......@@ -687,11 +693,10 @@ void rx_prach0(PHY_VARS_eNB *eNB,
LOG_M("prach_ifft0_br.m","prach_t0_br",prach_ifft,1024,1,1);
exit(-1);
}
}
} /* LOG_DUMPFLAG(PRACH) */
if (eNB) stop_meas(&eNB->rx_prach);
if (eNB) stop_meas(&eNB->rx_prach);
}
#if (LTE_RRC_VERSION >= MAKE_VERSION(14, 0, 0))
......@@ -704,16 +709,15 @@ void rx_prach(PHY_VARS_eNB *eNB,
uint16_t Nf,
uint8_t tdd_mapindex,
uint8_t br_flag) {
int i;
int prach_mask=0;
if (br_flag == 0) {
rx_prach0(eNB,ru,max_preamble,max_preamble_energy,max_preamble_delay,Nf,tdd_mapindex,0,0);
}
else { // This is procedure for eMTC, basically handling the repetitions
} else { // This is procedure for eMTC, basically handling the repetitions
prach_mask = is_prach_subframe(&eNB->frame_parms,eNB->proc.frame_prach_br,eNB->proc.subframe_prach_br);
for (i=0;i<4;i++) {
for (i=0; i<4; i++) {
if ((eNB->frame_parms.prach_emtc_config_common.prach_ConfigInfo.prach_CElevel_enable[i]==1) &&
((prach_mask&(1<<(i+1))) > 0)) { // check that prach CE level is active now
......@@ -722,14 +726,12 @@ void rx_prach(PHY_VARS_eNB *eNB,
// increment repetition number
eNB->prach_vars_br.repetition_number[i]++;
// do basic PRACH reception
rx_prach0(eNB,ru,max_preamble,max_preamble_energy,max_preamble_delay,Nf,tdd_mapindex,1,i);
// if last repetition, clear counter
if (eNB->prach_vars_br.repetition_number[i] == eNB->frame_parms.prach_emtc_config_common.prach_ConfigInfo.prach_numRepetitionPerPreambleAttempt[i]) {
eNB->prach_vars_br.repetition_number[i]=0;
}
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment