/*
 * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The OpenAirInterface Software Alliance licenses this file to You under
 * the OAI Public License, Version 1.0  (the "License"); you may not use this file
 * except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.openairinterface.org/?page_id=698
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *-------------------------------------------------------------------------------
 * For more information about the OpenAirInterface (OAI) Software Alliance:
 *      contact@openairinterface.org
 */

/* file: 3gpplte_turbo_decoder_sse.c
   purpose: Routines for implementing max-log-MAP decoding of Turbo-coded (DLSCH) transport channels from 36-212, V8.6 2009-03
   authors: raymond.knopp@eurecom.fr, Laurent Thomas (Alcatel-Lucent)
   date: 21.10.2009

   Note: This routine currently requires SSE2-, SSSE3- and SSE4.1-equipped computers. It uses 16-bit inputs for the LLRs and 8-bit arithmetic for the internal computations!

   Changelog: 17.11.2009 FK SSE4.1 not required anymore
   Aug. 2012 new parallelization options for higher speed (8-way parallelization)
   Jan. 2013 8-bit LLR support with 16-way parallelization
   Feb. 2013 New interleaving and hard-decision optimizations (L. Thomas)
   May 2013 Extracted 8-bit code
*/
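
/* Illustrative usage sketch (hypothetical caller, for orientation only;
   f1/f2 are the QPP interleaver parameters from 36.212 Table 5.1.3-3):

     init_td8();                          // build the interleaver tables once
     short y[3*6144+12];                  // 16-bit channel LLRs, tail included
     unsigned char bytes[6144/8];
     unsigned char iters =
       phy_threegpplte_turbo_decoder8(y, bytes, 6144,
                                      263, 480,   // f1,f2 for K=6144
                                      8,          // max iterations
                                      CRC24_A,
                                      0,          // F: number of filler bits
                                      NULL, NULL, NULL, NULL,
                                      NULL, NULL, NULL);
     // iters is the iteration count on CRC success, max_iterations+1 otherwise
*/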


#include "PHY/sse_intrin.h"

#ifndef TEST_DEBUG
#include "PHY/defs.h"
#include "PHY/CODING/defs.h"
#include "PHY/CODING/lte_interleaver_inline.h"
#include "extern_3GPPinterleaver.h"
#else

#include "defs.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#endif

#ifdef MEX
#include "mex.h"
#endif

#define SHUFFLE16(a,b,c,d,e,f,g,h) _mm_set_epi8(h==-1?-1:h*2+1, \
            h==-1?-1:h*2, \
            g==-1?-1:g*2+1, \
            g==-1?-1:g*2, \
            f==-1?-1:f*2+1, \
            f==-1?-1:f*2, \
            e==-1?-1:e*2+1, \
            e==-1?-1:e*2, \
            d==-1?-1:d*2+1, \
            d==-1?-1:d*2, \
            c==-1?-1:c*2+1, \
            c==-1?-1:c*2, \
            b==-1?-1:b*2+1, \
            b==-1?-1:b*2, \
            a==-1?-1:a*2+1, \
            a==-1?-1:a*2)
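
/* SHUFFLE16 builds a byte-shuffle mask for _mm_shuffle_epi8 from eight 16-bit
   word indices (-1 zeroes a word): output word i is taken from the input word
   given by the i-th argument.  For example, SHUFFLE16(7,6,5,4,3,2,1,0)
   reverses the order of the eight 16-bit words of a vector; this is how it is
   used when re-ordering the hard decisions below. */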





//#define DEBUG_LOGMAP



typedef int8_t llr_t; // internal decoder LLR data is 8-bit fixed
typedef int8_t channel_t;
#define MAX8 127


void log_map8(llr_t* systematic,channel_t* y_parity, llr_t* m11, llr_t* m10, llr_t *alpha, llr_t *beta, llr_t* ext,unsigned short frame_length,unsigned char term_flag,unsigned char F,int offset8_flag,
              time_stats_t *alpha_stats,time_stats_t *beta_stats,time_stats_t *gamma_stats,time_stats_t *ext_stats);
void compute_gamma8(llr_t* m11,llr_t* m10,llr_t* systematic, channel_t* y_parity, unsigned short frame_length,unsigned char term_flag);
void compute_alpha8(llr_t*alpha,llr_t *beta, llr_t* m11,llr_t* m10, unsigned short frame_length,unsigned char F);
void compute_beta8(llr_t*alpha, llr_t* beta,llr_t* m11,llr_t* m10, unsigned short frame_length,unsigned char F,int offset8_flag);
void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m11,llr_t* m10,llr_t* extrinsic, llr_t* ap, unsigned short frame_length);


void print_bytes(char *s, int8_t *x)
{
  printf("%s  : %d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",s,
         x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7],
         x[8],x[9],x[10],x[11],x[12],x[13],x[14],x[15]);
}


void log_map8(llr_t* systematic,
              channel_t* y_parity,
              llr_t* m11,
              llr_t* m10,
              llr_t *alpha,
              llr_t *beta,
              llr_t* ext,
              unsigned short frame_length,
              unsigned char term_flag,
              unsigned char F,
              int offset8_flag,
              time_stats_t *alpha_stats,
              time_stats_t *beta_stats,
              time_stats_t *gamma_stats,
              time_stats_t *ext_stats)
{

#ifdef DEBUG_LOGMAP
  msg("log_map, frame_length %d\n",frame_length);
#endif

  if (gamma_stats) start_meas(gamma_stats) ;
  compute_gamma8(m11,m10,systematic,y_parity,frame_length,term_flag) ;
  if (gamma_stats) stop_meas(gamma_stats);
  if (alpha_stats) start_meas(alpha_stats) ;
  compute_alpha8(alpha,beta,m11,m10,frame_length,F)                  ;
  if (alpha_stats) stop_meas(alpha_stats);
  if (beta_stats) start_meas(beta_stats)  ;
  compute_beta8(alpha,beta,m11,m10,frame_length,F,offset8_flag)      ;
  if (beta_stats) stop_meas(beta_stats);
  if (ext_stats) start_meas(ext_stats)   ;
  compute_ext8(alpha,beta,m11,m10,ext,systematic,frame_length)       ;
  if (ext_stats) stop_meas(ext_stats);


}
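
/* log_map8 above performs one half-iteration of the max-log-MAP algorithm:
   with branch metrics gamma_k(s',s), the forward/backward recursions are

     alpha_k(s) = max_{s'} ( alpha_{k-1}(s') + gamma_k(s',s) )
     beta_k(s)  = max_{s'} ( beta_{k+1}(s')  + gamma_{k+1}(s,s') )

   and the extrinsic output per systematic bit is

     ext_k = max_{transitions with bit 1} (alpha + gamma + beta)
           - max_{transitions with bit 0} (alpha + gamma + beta)

   Each recursion step subtracts the maximum over the 8 states so that the
   saturating 8-bit metrics do not overflow. */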

void compute_gamma8(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity,
                    unsigned short frame_length,unsigned char term_flag)
{
  int k,K1;
#if defined(__x86_64__)||defined(__i386__)
  __m128i *systematic128 = (__m128i *)systematic;
  __m128i *y_parity128   = (__m128i *)y_parity;
  __m128i *m10_128        = (__m128i *)m10;
  __m128i *m11_128        = (__m128i *)m11;
#elif defined(__arm__)
  int8x16_t *systematic128  = (int8x16_t *)systematic;
  int8x16_t *y_parity128    = (int8x16_t *)y_parity;
  int8x16_t *m10_128        = (int8x16_t *)m10;
  int8x16_t *m11_128        = (int8x16_t *)m11;
#endif

#ifdef DEBUG_LOGMAP
  msg("compute_gamma, %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
#endif

#if defined(__x86_64__) || defined(__i386__)
  register __m128i sl,sh,ypl,yph; //K128=_mm_set1_epi8(-128);
#endif
  K1 = (frame_length>>4);

  for (k=0; k<K1; k++) {
#if defined(__x86_64__) || defined(__i386__)
    sl  = _mm_cvtepi8_epi16(systematic128[k]);
    sh  = _mm_cvtepi8_epi16(_mm_srli_si128(systematic128[k],8));
    ypl = _mm_cvtepi8_epi16(y_parity128[k]);
    yph = _mm_cvtepi8_epi16(_mm_srli_si128(y_parity128[k],8));
    m11_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(sl,ypl),1),
                                 _mm_srai_epi16(_mm_adds_epi16(sh,yph),1));
    m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1),
                                 _mm_srai_epi16(_mm_subs_epi16(sh,yph),1));
#elif defined(__arm__)
    m11_128[k] = vhaddq_s8(systematic128[k],y_parity128[k]);
    m10_128[k] = vhsubq_s8(systematic128[k],y_parity128[k]);
#endif

  }

  // Termination

#if defined(__x86_64__) || defined(__i386__)
  sl  = _mm_cvtepi8_epi16(systematic128[k+term_flag]);
  sh  = _mm_cvtepi8_epi16(_mm_srli_si128(systematic128[k+term_flag],8));
  // the parity tail is stored at offset n2 for both constituent codes (see the
  // termination loops in phy_threegpplte_turbo_decoder8), hence index k here
  ypl = _mm_cvtepi8_epi16(y_parity128[k]);
  yph = _mm_cvtepi8_epi16(_mm_srli_si128(y_parity128[k],8));
  m11_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(sl,ypl),1),
                               _mm_srai_epi16(_mm_adds_epi16(sh,yph),1));
  m10_128[k] = _mm_packs_epi16(_mm_srai_epi16(_mm_subs_epi16(sl,ypl),1),
                               _mm_srai_epi16(_mm_subs_epi16(sh,yph),1));
#elif defined(__arm__)
  m11_128[k] = vhaddq_s8(systematic128[k+term_flag],y_parity128[k]);
  m10_128[k] = vhsubq_s8(systematic128[k+term_flag],y_parity128[k]);
#endif

}
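
/* Scalar reference for the vectorized branch-metric computation above (a
   sketch for readability, not used by the decoder):

     for (k=0; k<frame_length; k++) {
       m11[k] = (systematic[k] + y_parity[k]) / 2;  // transition label (s,p)=(1,1)
       m10[k] = (systematic[k] - y_parity[k]) / 2;  // transition label (s,p)=(1,0)
     }

   Only these two metrics are stored, since m00 = -m11 and m01 = -m10. */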

#define L 16  // warm-up length in trellis steps for the second pass of the forward/backward recursions

void compute_alpha8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned short frame_length,unsigned char F)
{
  int k,loopval,rerun_flag;

#if defined(__x86_64__) || defined(__i386__)
  __m128i *alpha128=(__m128i *)alpha,*alpha_ptr;
  __m128i *m11p,*m10p;
  __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
  __m128i new0,new1,new2,new3,new4,new5,new6,new7;
  __m128i alpha_max;
#elif defined(__arm__)
  int8x16_t *alpha128=(int8x16_t *)alpha,*alpha_ptr;
  int8x16_t *m11p,*m10p;
  int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
  int8x16_t new0,new1,new2,new3,new4,new5,new6,new7;
  int8x16_t alpha_max;
#endif
  // Set the initial state: the first column starts from the known all-zero
  // state; the other columns are unknown, so all their states get the same value

#if defined(__x86_64__) || defined(__i386__)
  alpha128[0] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,0);
  alpha128[1] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
  alpha128[2] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
  alpha128[3] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
  alpha128[4] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
  alpha128[5] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
  alpha128[6] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
  alpha128[7] = _mm_set_epi8(-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2,-MAX8/2);
  for (loopval=frame_length>>4, rerun_flag=0; rerun_flag<2; loopval=L, rerun_flag++) {

    alpha_ptr = &alpha128[0];

    m11p = (__m128i*)m_11;
    m10p = (__m128i*)m_10;

    for (k=0;  k<loopval;  k++) {
      m_b0 = _mm_adds_epi8(alpha_ptr[1],*m11p);  // m11
      m_b4 = _mm_subs_epi8(alpha_ptr[1],*m11p);  // m00=-m11
      m_b1 = _mm_subs_epi8(alpha_ptr[3],*m10p);  // m01=-m10
      m_b5 = _mm_adds_epi8(alpha_ptr[3],*m10p);  // m10
      m_b2 = _mm_adds_epi8(alpha_ptr[5],*m10p);  // m10
      m_b6 = _mm_subs_epi8(alpha_ptr[5],*m10p);  // m01=-m10
      m_b3 = _mm_subs_epi8(alpha_ptr[7],*m11p);  // m00=-m11
      m_b7 = _mm_adds_epi8(alpha_ptr[7],*m11p);  // m11

      new0 = _mm_subs_epi8(alpha_ptr[0],*m11p);  // m00=-m11
      new4 = _mm_adds_epi8(alpha_ptr[0],*m11p);  // m11
      new1 = _mm_adds_epi8(alpha_ptr[2],*m10p);  // m10
      new5 = _mm_subs_epi8(alpha_ptr[2],*m10p);  // m01=-m10
      new2 = _mm_subs_epi8(alpha_ptr[4],*m10p);  // m01=-m10
      new6 = _mm_adds_epi8(alpha_ptr[4],*m10p);  // m10
      new3 = _mm_adds_epi8(alpha_ptr[6],*m11p);  // m11
      new7 = _mm_subs_epi8(alpha_ptr[6],*m11p);  // m00=-m11

      alpha_ptr += 8;
      m11p++;
      m10p++;
      alpha_ptr[0] = _mm_max_epi8(m_b0,new0);
      alpha_ptr[1] = _mm_max_epi8(m_b1,new1);
      alpha_ptr[2] = _mm_max_epi8(m_b2,new2);
      alpha_ptr[3] = _mm_max_epi8(m_b3,new3);
      alpha_ptr[4] = _mm_max_epi8(m_b4,new4);
      alpha_ptr[5] = _mm_max_epi8(m_b5,new5);
      alpha_ptr[6] = _mm_max_epi8(m_b6,new6);
      alpha_ptr[7] = _mm_max_epi8(m_b7,new7);

      // compute and subtract maxima
      alpha_max = _mm_max_epi8(alpha_ptr[0],alpha_ptr[1]);
      alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[2]);
      alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[3]);
      alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[4]);
      alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[5]);
      alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[6]);
      alpha_max = _mm_max_epi8(alpha_max,alpha_ptr[7]);

      alpha_ptr[0] = _mm_subs_epi8(alpha_ptr[0],alpha_max);
      alpha_ptr[1] = _mm_subs_epi8(alpha_ptr[1],alpha_max);
      alpha_ptr[2] = _mm_subs_epi8(alpha_ptr[2],alpha_max);
      alpha_ptr[3] = _mm_subs_epi8(alpha_ptr[3],alpha_max);
      alpha_ptr[4] = _mm_subs_epi8(alpha_ptr[4],alpha_max);
      alpha_ptr[5] = _mm_subs_epi8(alpha_ptr[5],alpha_max);
      alpha_ptr[6] = _mm_subs_epi8(alpha_ptr[6],alpha_max);
      alpha_ptr[7] = _mm_subs_epi8(alpha_ptr[7],alpha_max);
    }

    // Set the initial state for the next iteration from the last state,
    // as a column's end states are the first states of the next column
    int K1= frame_length>>1;
    alpha128[0] = _mm_slli_si128(alpha128[K1],1);
    alpha128[1] = _mm_slli_si128(alpha128[1+K1],1);
    alpha128[2] = _mm_slli_si128(alpha128[2+K1],1);
    alpha128[3] = _mm_slli_si128(alpha128[3+K1],1);
    alpha128[4] = _mm_slli_si128(alpha128[4+K1],1);
    alpha128[5] = _mm_slli_si128(alpha128[5+K1],1);
    alpha128[6] = _mm_slli_si128(alpha128[6+K1],1);
    alpha128[7] = _mm_slli_si128(alpha128[7+K1],1);
    // lane 0 of each vector is column 0's known initial state: state 0 keeps
    // metric 0 (set by the byte shift above), states 1..7 get -MAX8/2
    alpha[16] =  -MAX8/2;
    alpha[32] = -MAX8/2;
    alpha[48] = -MAX8/2;
    alpha[64] = -MAX8/2;
    alpha[80] = -MAX8/2;
    alpha[96] = -MAX8/2;
    alpha[112] = -MAX8/2;

  }
#elif defined(__arm__)
  alpha128[0] = vdupq_n_s8(-MAX8/2);
  alpha128[0] = vsetq_lane_s8(0,alpha128[0],0);
  alpha128[1] = vdupq_n_s8(-MAX8/2);
  alpha128[2] = vdupq_n_s8(-MAX8/2);
  alpha128[3] = vdupq_n_s8(-MAX8/2);
  alpha128[4] = vdupq_n_s8(-MAX8/2);
  alpha128[5] = vdupq_n_s8(-MAX8/2);
  alpha128[6] = vdupq_n_s8(-MAX8/2);
  alpha128[7] = vdupq_n_s8(-MAX8/2);
  for (loopval=frame_length>>4, rerun_flag=0; rerun_flag<2; loopval=L, rerun_flag++) {

    alpha_ptr = &alpha128[0];

    m11p = (int8x16_t*)m_11;
    m10p = (int8x16_t*)m_10;

    for (k=0;  k<loopval;  k++) {
      m_b0 = vqaddq_s8(alpha_ptr[1],*m11p);  // m11
      m_b4 = vqsubq_s8(alpha_ptr[1],*m11p);  // m00=-m11
      m_b1 = vqsubq_s8(alpha_ptr[3],*m10p);  // m01=-m10
      m_b5 = vqaddq_s8(alpha_ptr[3],*m10p);  // m10
      m_b2 = vqaddq_s8(alpha_ptr[5],*m10p);  // m10
      m_b6 = vqsubq_s8(alpha_ptr[5],*m10p);  // m01=-m10
      m_b3 = vqsubq_s8(alpha_ptr[7],*m11p);  // m00=-m11
      m_b7 = vqaddq_s8(alpha_ptr[7],*m11p);  // m11

      new0 = vqsubq_s8(alpha_ptr[0],*m11p);  // m00=-m11
      new4 = vqaddq_s8(alpha_ptr[0],*m11p);  // m11
      new1 = vqaddq_s8(alpha_ptr[2],*m10p);  // m10
      new5 = vqsubq_s8(alpha_ptr[2],*m10p);  // m01=-m10
      new2 = vqsubq_s8(alpha_ptr[4],*m10p);  // m01=-m10
      new6 = vqaddq_s8(alpha_ptr[4],*m10p);  // m10
      new3 = vqaddq_s8(alpha_ptr[6],*m11p);  // m11
      new7 = vqsubq_s8(alpha_ptr[6],*m11p);  // m00=-m11

      alpha_ptr += 8;
      m11p++;
      m10p++;
      alpha_ptr[0] = vmaxq_s8(m_b0,new0);
      alpha_ptr[1] = vmaxq_s8(m_b1,new1);
      alpha_ptr[2] = vmaxq_s8(m_b2,new2);
      alpha_ptr[3] = vmaxq_s8(m_b3,new3);
      alpha_ptr[4] = vmaxq_s8(m_b4,new4);
      alpha_ptr[5] = vmaxq_s8(m_b5,new5);
      alpha_ptr[6] = vmaxq_s8(m_b6,new6);
      alpha_ptr[7] = vmaxq_s8(m_b7,new7);

      // compute and subtract maxima
      alpha_max = vmaxq_s8(alpha_ptr[0],alpha_ptr[1]);
      alpha_max = vmaxq_s8(alpha_max,alpha_ptr[2]);
      alpha_max = vmaxq_s8(alpha_max,alpha_ptr[3]);
      alpha_max = vmaxq_s8(alpha_max,alpha_ptr[4]);
      alpha_max = vmaxq_s8(alpha_max,alpha_ptr[5]);
      alpha_max = vmaxq_s8(alpha_max,alpha_ptr[6]);
      alpha_max = vmaxq_s8(alpha_max,alpha_ptr[7]);

      alpha_ptr[0] = vqsubq_s8(alpha_ptr[0],alpha_max);
      alpha_ptr[1] = vqsubq_s8(alpha_ptr[1],alpha_max);
      alpha_ptr[2] = vqsubq_s8(alpha_ptr[2],alpha_max);
      alpha_ptr[3] = vqsubq_s8(alpha_ptr[3],alpha_max);
      alpha_ptr[4] = vqsubq_s8(alpha_ptr[4],alpha_max);
      alpha_ptr[5] = vqsubq_s8(alpha_ptr[5],alpha_max);
      alpha_ptr[6] = vqsubq_s8(alpha_ptr[6],alpha_max);
      alpha_ptr[7] = vqsubq_s8(alpha_ptr[7],alpha_max);
    }

    // Set the initial state for the next iteration from the last state,
    // as a column's end states are the first states of the next column
    int K1= frame_length>>1;
    alpha128[0] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[K1],8);   alpha128[0] = vsetq_lane_s8(alpha[8],alpha128[0],7);
    alpha128[1] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[1+K1],8); alpha128[1] = vsetq_lane_s8(alpha[24],alpha128[1],7);
    alpha128[2] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[2+K1],8); alpha128[2] = vsetq_lane_s8(alpha[40],alpha128[2],7);
    alpha128[3] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[3+K1],8); alpha128[3] = vsetq_lane_s8(alpha[56],alpha128[3],7);
    alpha128[4] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[4+K1],8); alpha128[4] = vsetq_lane_s8(alpha[72],alpha128[4],7);
    alpha128[5] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[5+K1],8); alpha128[5] = vsetq_lane_s8(alpha[88],alpha128[5],7);
    alpha128[6] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[6+K1],8); alpha128[6] = vsetq_lane_s8(alpha[104],alpha128[6],7);
    alpha128[7] = (int8x16_t)vshlq_n_s64((int64x2_t)alpha128[7+K1],8); alpha128[7] = vsetq_lane_s8(alpha[120],alpha128[7],7);
    alpha[16] =  -MAX8/2;
    alpha[32] = -MAX8/2;
    alpha[48] = -MAX8/2;
    alpha[64] = -MAX8/2;
    alpha[80] = -MAX8/2;
    alpha[96] = -MAX8/2;
    alpha[112] = -MAX8/2;

  }
#endif


}
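
/* One forward step of compute_alpha8 in scalar form: each of the 8 trellis
   states takes the max-log maximum over its two predecessors, e.g. for
   state 0

     alpha_k[0] = max( alpha_{k-1}[1] + m11[k],    // input bit 1
                       alpha_{k-1}[0] - m11[k] );  // input bit 0

   The SIMD code runs 16 such recursions in parallel, one sub-block of the
   frame per byte lane, and the second pass (loopval=L) re-runs the first L
   steps from wrapped-around initial states so that the lanes whose true
   initial state is unknown converge. */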


void compute_beta8(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned short frame_length,unsigned char F,int offset8_flag)
{

  int k,rerun_flag, loopval;
#if defined(__x86_64__) || defined(__i386__)
  __m128i m11_128,m10_128;
  __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
  __m128i new0,new1,new2,new3,new4,new5,new6,new7;

  __m128i *beta128,*alpha128,*beta_ptr;
  __m128i beta_max;
#elif defined(__arm__)
  int8x16_t m11_128,m10_128;
  int8x16_t m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
  int8x16_t new0,new1,new2,new3,new4,new5,new6,new7;

  int8x16_t *beta128,*alpha128,*beta_ptr;
  int8x16_t beta_max;
#endif
  llr_t beta0,beta1;

  llr_t beta2,beta3,beta4,beta5,beta6,beta7;


#if 0
  int16_t m11,m10;
  int16_t beta0_16,beta1_16,beta2_16,beta3_16,beta4_16,beta5_16,beta6_16,beta7_16,beta0_2,beta1_2,beta2_2,beta3_2,beta_m;
  __m128i beta_16;
  // termination for beta initialization

  m11=(int16_t)m_11[2+frame_length];
  m10=(int16_t)m_10[2+frame_length];

  beta0 = -m11;//M0T_TERM;
  beta1 = m11;//M1T_TERM;
  m11=(int16_t)m_11[1+frame_length];
  m10=(int16_t)m_10[1+frame_length];

  beta0_2 = beta0-m11;//+M0T_TERM;
  beta1_2 = beta0+m11;//+M1T_TERM;
  beta2_2 = beta1+m10;//M2T_TERM;
  beta3_2 = beta1-m10;//+M3T_TERM;
  m11=(int16_t)m_11[frame_length];
  m10=(int16_t)m_10[frame_length];

  beta0_16 = beta0_2-m11;//+M0T_TERM;
  beta1_16 = beta0_2+m11;//+M1T_TERM;
  beta2_16 = beta1_2+m10;//+M2T_TERM;
  beta3_16 = beta1_2-m10;//+M3T_TERM;
  beta4_16 = beta2_2-m10;//+M4T_TERM;
  beta5_16 = beta2_2+m10;//+M5T_TERM;
  beta6_16 = beta3_2+m11;//+M6T_TERM;
  beta7_16 = beta3_2-m11;//+M7T_TERM;

  beta_m = (beta0_16>beta1_16) ? beta0_16 : beta1_16;
  beta_m = (beta_m>beta2_16) ? beta_m : beta2_16;
  beta_m = (beta_m>beta3_16) ? beta_m : beta3_16;
  beta_m = (beta_m>beta4_16) ? beta_m : beta4_16;
  beta_m = (beta_m>beta5_16) ? beta_m : beta5_16;
  beta_m = (beta_m>beta6_16) ? beta_m : beta6_16;
  beta_m = (beta_m>beta7_16) ? beta_m : beta7_16;

  beta0_16=beta0_16-beta_m;
  beta1_16=beta1_16-beta_m;
  beta2_16=beta2_16-beta_m;
  beta3_16=beta3_16-beta_m;
  beta4_16=beta4_16-beta_m;
  beta5_16=beta5_16-beta_m;
  beta6_16=beta6_16-beta_m;
  beta7_16=beta7_16-beta_m;

  beta_16 = _mm_set_epi16(beta7_16,beta6_16,beta5_16,beta4_16,beta3_16,beta2_16,beta1_16,beta0_16);
  beta_16 = _mm_packs_epi16(beta_16,beta_16);
  beta0 = _mm_extract_epi8(beta_16,0);
  beta1 = _mm_extract_epi8(beta_16,1);
  beta2 = _mm_extract_epi8(beta_16,2);
  beta3 = _mm_extract_epi8(beta_16,3);
  beta4 = _mm_extract_epi8(beta_16,4);
  beta5 = _mm_extract_epi8(beta_16,5);
  beta6 = _mm_extract_epi8(beta_16,6);
  beta7 = _mm_extract_epi8(beta_16,7);

#endif

  if (frame_length > 6144) {
    LOG_E(PHY,"compute_beta: frame_length %d\n",frame_length);
    return;
  }

  // compute_alpha8 is supposed to run just before compute_beta8, so the
  // initial states of the backward computation can be set from the last
  // alpha states (forward computation)

#if defined(__x86_64__) || defined(__i386__)
  beta_ptr   = (__m128i*)&beta[frame_length<<3];
  alpha128   = (__m128i*)&alpha[0];
#elif defined(__arm__)
  beta_ptr   = (int8x16_t*)&beta[frame_length<<3];
  alpha128   = (int8x16_t*)&alpha[0];
#endif
  beta_ptr[0] = alpha128[(frame_length>>1)];
  beta_ptr[1] = alpha128[1+(frame_length>>1)];
  beta_ptr[2] = alpha128[2+(frame_length>>1)];
  beta_ptr[3] = alpha128[3+(frame_length>>1)];
  beta_ptr[4] = alpha128[4+(frame_length>>1)];
  beta_ptr[5] = alpha128[5+(frame_length>>1)];
  beta_ptr[6] = alpha128[6+(frame_length>>1)];
  beta_ptr[7] = alpha128[7+(frame_length>>1)];

  int overlap = (frame_length>>4)> L ? (frame_length>>4)-L : 0 ;

  for (rerun_flag=0, loopval=0;
       rerun_flag<2 ;
       loopval=overlap,rerun_flag++) {

    if (offset8_flag==0) {
      // FIXME! beta0-beta7 are used uninitialized. FIXME!
      // workaround: init with 0
      beta0 = beta1 = beta2 = beta3 = beta4 = beta5 = beta6 = beta7 = 0;

#if defined(__x86_64__) || defined(__i386__)
      beta_ptr[0] = _mm_insert_epi8(beta_ptr[0],beta0,15);
      beta_ptr[1] = _mm_insert_epi8(beta_ptr[1],beta1,15);
      beta_ptr[2] = _mm_insert_epi8(beta_ptr[2],beta2,15);
      beta_ptr[3] = _mm_insert_epi8(beta_ptr[3],beta3,15);
      beta_ptr[4] = _mm_insert_epi8(beta_ptr[4],beta4,15);
      beta_ptr[5] = _mm_insert_epi8(beta_ptr[5],beta5,15);
      beta_ptr[6] = _mm_insert_epi8(beta_ptr[6],beta6,15);
      beta_ptr[7] = _mm_insert_epi8(beta_ptr[7],beta7,15);
#elif defined(__arm__)
      beta_ptr[0] = vsetq_lane_s8(beta0,beta_ptr[0],15);
      beta_ptr[1] = vsetq_lane_s8(beta1,beta_ptr[1],15);
      beta_ptr[2] = vsetq_lane_s8(beta2,beta_ptr[2],15);
      beta_ptr[3] = vsetq_lane_s8(beta3,beta_ptr[3],15);
      beta_ptr[4] = vsetq_lane_s8(beta4,beta_ptr[4],15);
      beta_ptr[5] = vsetq_lane_s8(beta5,beta_ptr[5],15);
      beta_ptr[6] = vsetq_lane_s8(beta6,beta_ptr[6],15);
      beta_ptr[7] = vsetq_lane_s8(beta7,beta_ptr[7],15);
#endif
    }

#if defined(__x86_64__) || defined(__i386__)
    beta_ptr = (__m128i*)&beta[frame_length<<3];
#elif defined(__arm__)
    beta_ptr = (int8x16_t*)&beta[frame_length<<3];
#endif
    for (k=(frame_length>>4)-1;
         k>=loopval;
         k--) {
#if defined(__x86_64__) || defined(__i386__)
      m11_128=((__m128i*)m_11)[k];
      m10_128=((__m128i*)m_10)[k];
      m_b0 = _mm_adds_epi8(beta_ptr[4],m11_128);  //m11
      m_b1 = _mm_subs_epi8(beta_ptr[4],m11_128);  //m00
      m_b2 = _mm_subs_epi8(beta_ptr[5],m10_128);  //m01
      m_b3 = _mm_adds_epi8(beta_ptr[5],m10_128);  //m10
      m_b4 = _mm_adds_epi8(beta_ptr[6],m10_128);  //m10
      m_b5 = _mm_subs_epi8(beta_ptr[6],m10_128);  //m01
      m_b6 = _mm_subs_epi8(beta_ptr[7],m11_128);  //m00
      m_b7 = _mm_adds_epi8(beta_ptr[7],m11_128);  //m11

      new0 = _mm_subs_epi8(beta_ptr[0],m11_128);  //m00
      new1 = _mm_adds_epi8(beta_ptr[0],m11_128);  //m11
      new2 = _mm_adds_epi8(beta_ptr[1],m10_128);  //m10
      new3 = _mm_subs_epi8(beta_ptr[1],m10_128);  //m01
      new4 = _mm_subs_epi8(beta_ptr[2],m10_128);  //m01
      new5 = _mm_adds_epi8(beta_ptr[2],m10_128);  //m10
      new6 = _mm_adds_epi8(beta_ptr[3],m11_128);  //m11
      new7 = _mm_subs_epi8(beta_ptr[3],m11_128);  //m00

      beta_ptr-=8;

      beta_ptr[0] = _mm_max_epi8(m_b0,new0);
      beta_ptr[1] = _mm_max_epi8(m_b1,new1);
      beta_ptr[2] = _mm_max_epi8(m_b2,new2);
      beta_ptr[3] = _mm_max_epi8(m_b3,new3);
      beta_ptr[4] = _mm_max_epi8(m_b4,new4);
      beta_ptr[5] = _mm_max_epi8(m_b5,new5);
      beta_ptr[6] = _mm_max_epi8(m_b6,new6);
      beta_ptr[7] = _mm_max_epi8(m_b7,new7);

      beta_max = _mm_max_epi8(beta_ptr[0],beta_ptr[1]);
      beta_max = _mm_max_epi8(beta_max   ,beta_ptr[2]);
      beta_max = _mm_max_epi8(beta_max   ,beta_ptr[3]);
      beta_max = _mm_max_epi8(beta_max   ,beta_ptr[4]);
      beta_max = _mm_max_epi8(beta_max   ,beta_ptr[5]);
      beta_max = _mm_max_epi8(beta_max   ,beta_ptr[6]);
      beta_max = _mm_max_epi8(beta_max   ,beta_ptr[7]);

      beta_ptr[0] = _mm_subs_epi8(beta_ptr[0],beta_max);
      beta_ptr[1] = _mm_subs_epi8(beta_ptr[1],beta_max);
      beta_ptr[2] = _mm_subs_epi8(beta_ptr[2],beta_max);
      beta_ptr[3] = _mm_subs_epi8(beta_ptr[3],beta_max);
      beta_ptr[4] = _mm_subs_epi8(beta_ptr[4],beta_max);
      beta_ptr[5] = _mm_subs_epi8(beta_ptr[5],beta_max);
      beta_ptr[6] = _mm_subs_epi8(beta_ptr[6],beta_max);
      beta_ptr[7] = _mm_subs_epi8(beta_ptr[7],beta_max);
#elif defined(__arm__)
      m11_128=((int8x16_t*)m_11)[k];
      m10_128=((int8x16_t*)m_10)[k];
      m_b0 = vqaddq_s8(beta_ptr[4],m11_128);  //m11
      m_b1 = vqsubq_s8(beta_ptr[4],m11_128);  //m00
      m_b2 = vqsubq_s8(beta_ptr[5],m10_128);  //m01
      m_b3 = vqaddq_s8(beta_ptr[5],m10_128);  //m10
      m_b4 = vqaddq_s8(beta_ptr[6],m10_128);  //m10
      m_b5 = vqsubq_s8(beta_ptr[6],m10_128);  //m01
      m_b6 = vqsubq_s8(beta_ptr[7],m11_128);  //m00
      m_b7 = vqaddq_s8(beta_ptr[7],m11_128);  //m11

      new0 = vqsubq_s8(beta_ptr[0],m11_128);  //m00
      new1 = vqaddq_s8(beta_ptr[0],m11_128);  //m11
      new2 = vqaddq_s8(beta_ptr[1],m10_128);  //m10
      new3 = vqsubq_s8(beta_ptr[1],m10_128);  //m01
      new4 = vqsubq_s8(beta_ptr[2],m10_128);  //m01
      new5 = vqaddq_s8(beta_ptr[2],m10_128);  //m10
      new6 = vqaddq_s8(beta_ptr[3],m11_128);  //m11
      new7 = vqsubq_s8(beta_ptr[3],m11_128);  //m00

      beta_ptr-=8;

      beta_ptr[0] = vmaxq_s8(m_b0,new0);
      beta_ptr[1] = vmaxq_s8(m_b1,new1);
      beta_ptr[2] = vmaxq_s8(m_b2,new2);
      beta_ptr[3] = vmaxq_s8(m_b3,new3);
      beta_ptr[4] = vmaxq_s8(m_b4,new4);
      beta_ptr[5] = vmaxq_s8(m_b5,new5);
      beta_ptr[6] = vmaxq_s8(m_b6,new6);
      beta_ptr[7] = vmaxq_s8(m_b7,new7);

      beta_max = vmaxq_s8(beta_ptr[0],beta_ptr[1]);
      beta_max = vmaxq_s8(beta_max   ,beta_ptr[2]);
      beta_max = vmaxq_s8(beta_max   ,beta_ptr[3]);
      beta_max = vmaxq_s8(beta_max   ,beta_ptr[4]);
      beta_max = vmaxq_s8(beta_max   ,beta_ptr[5]);
      beta_max = vmaxq_s8(beta_max   ,beta_ptr[6]);
      beta_max = vmaxq_s8(beta_max   ,beta_ptr[7]);

      beta_ptr[0] = vqsubq_s8(beta_ptr[0],beta_max);
      beta_ptr[1] = vqsubq_s8(beta_ptr[1],beta_max);
      beta_ptr[2] = vqsubq_s8(beta_ptr[2],beta_max);
      beta_ptr[3] = vqsubq_s8(beta_ptr[3],beta_max);
      beta_ptr[4] = vqsubq_s8(beta_ptr[4],beta_max);
      beta_ptr[5] = vqsubq_s8(beta_ptr[5],beta_max);
      beta_ptr[6] = vqsubq_s8(beta_ptr[6],beta_max);
      beta_ptr[7] = vqsubq_s8(beta_ptr[7],beta_max);
#endif
    }

    // Set the initial state for the next iteration from the last state,
    // as a column's last states are the first states of the next column.
    // The initial state of column 0 comes from the tail bits (to be computed).

#if defined(__x86_64__) || defined(__i386__)
    beta128 = (__m128i*)&beta[0];
    beta_ptr   = (__m128i*)&beta[frame_length<<3];
    beta_ptr[0] = _mm_srli_si128(beta128[0],1);
    beta_ptr[1] = _mm_srli_si128(beta128[1],1);
    beta_ptr[2] = _mm_srli_si128(beta128[2],1);
    beta_ptr[3] = _mm_srli_si128(beta128[3],1);
    beta_ptr[4] = _mm_srli_si128(beta128[4],1);
    beta_ptr[5] = _mm_srli_si128(beta128[5],1);
    beta_ptr[6] = _mm_srli_si128(beta128[6],1);
    beta_ptr[7] = _mm_srli_si128(beta128[7],1);
#elif defined(__arm__)
    beta128 = (int8x16_t*)&beta[0];
    beta_ptr   = (int8x16_t*)&beta[frame_length<<3];
    beta_ptr[0] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[0],8);   beta_ptr[0] = vsetq_lane_s8(beta[7],beta_ptr[0],8);
    beta_ptr[1] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[1],8);   beta_ptr[1] = vsetq_lane_s8(beta[23],beta_ptr[1],8);
    beta_ptr[2] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[2],8);   beta_ptr[2] = vsetq_lane_s8(beta[39],beta_ptr[2],8);
    beta_ptr[3] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[3],8);   beta_ptr[3] = vsetq_lane_s8(beta[55],beta_ptr[3],8);
    beta_ptr[4] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[4],8);   beta_ptr[4] = vsetq_lane_s8(beta[71],beta_ptr[4],8);
    beta_ptr[5] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[5],8);   beta_ptr[5] = vsetq_lane_s8(beta[87],beta_ptr[5],8);
    beta_ptr[6] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[6],8);   beta_ptr[6] = vsetq_lane_s8(beta[103],beta_ptr[6],8);
    beta_ptr[7] = (int8x16_t)vshrq_n_s64((int64x2_t)beta128[7],8);   beta_ptr[7] = vsetq_lane_s8(beta[119],beta_ptr[7],8);
#endif
  }
}

void compute_ext8(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext, llr_t* systematic,unsigned short frame_length)
{

#if defined(__x86_64__) || defined(__i386__)
  __m128i *alpha128=(__m128i *)alpha;
  __m128i *beta128=(__m128i *)beta;
  __m128i *m11_128,*m10_128,*ext_128;
  __m128i *alpha_ptr,*beta_ptr;
  __m128i m00_1,m00_2,m00_3,m00_4;
  __m128i m01_1,m01_2,m01_3,m01_4;
  __m128i m10_1,m10_2,m10_3,m10_4;
  __m128i m11_1,m11_2,m11_3,m11_4;
#elif defined(__arm__)
  int8x16_t *alpha128=(int8x16_t *)alpha;
  int8x16_t *beta128=(int8x16_t *)beta;
  int8x16_t *m11_128,*m10_128,*ext_128;
  int8x16_t *alpha_ptr,*beta_ptr;
  int8x16_t m00_1,m00_2,m00_3,m00_4;
  int8x16_t m01_1,m01_2,m01_3,m01_4;
  int8x16_t m10_1,m10_2,m10_3,m10_4;
  int8x16_t m11_1,m11_2,m11_3,m11_4;
#endif
  int k;

  //
  // LLR computation, 16 consecutive bits per loop iteration
  //

#ifdef DEBUG_LOGMAP
  msg("compute_ext, %p, %p, %p, %p, %p, %p ,framelength %d\n",alpha,beta,m_11,m_10,ext,systematic,frame_length);
#endif

  alpha_ptr = alpha128;
  beta_ptr = &beta128[8];


  for (k=0; k<(frame_length>>4); k++) {

#if defined(__x86_64__) || defined(__i386__)

    m11_128        = (__m128i*)&m_11[k<<4];
    m10_128        = (__m128i*)&m_10[k<<4];
    ext_128        = (__m128i*)&ext[k<<4];

    m00_4 = _mm_adds_epi8(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
    m11_4 = _mm_adds_epi8(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
    m00_3 = _mm_adds_epi8(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
    m11_3 = _mm_adds_epi8(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11;
    m00_2 = _mm_adds_epi8(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00;
    m11_2 = _mm_adds_epi8(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11;
    m11_1 = _mm_adds_epi8(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11;
    m00_1 = _mm_adds_epi8(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00;
    m01_4 = _mm_adds_epi8(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01;
    m10_4 = _mm_adds_epi8(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10;
    m01_3 = _mm_adds_epi8(alpha_ptr[4],beta_ptr[2]); //ALPHA_BETA_3m01;
    m10_3 = _mm_adds_epi8(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10;
    m01_2 = _mm_adds_epi8(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01;
    m10_2 = _mm_adds_epi8(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
    m10_1 = _mm_adds_epi8(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
    m01_1 = _mm_adds_epi8(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;

    m01_1 = _mm_max_epi8(m01_1,m01_2);
    m01_1 = _mm_max_epi8(m01_1,m01_3);
    m01_1 = _mm_max_epi8(m01_1,m01_4);
    m00_1 = _mm_max_epi8(m00_1,m00_2);
    m00_1 = _mm_max_epi8(m00_1,m00_3);
    m00_1 = _mm_max_epi8(m00_1,m00_4);
    m10_1 = _mm_max_epi8(m10_1,m10_2);
    m10_1 = _mm_max_epi8(m10_1,m10_3);
    m10_1 = _mm_max_epi8(m10_1,m10_4);
    m11_1 = _mm_max_epi8(m11_1,m11_2);
    m11_1 = _mm_max_epi8(m11_1,m11_3);
    m11_1 = _mm_max_epi8(m11_1,m11_4);


    m01_1 = _mm_subs_epi8(m01_1,*m10_128);
    m00_1 = _mm_subs_epi8(m00_1,*m11_128);
    m10_1 = _mm_adds_epi8(m10_1,*m10_128);
    m11_1 = _mm_adds_epi8(m11_1,*m11_128);


    m01_1 = _mm_max_epi8(m01_1,m00_1);
    m10_1 = _mm_max_epi8(m10_1,m11_1);


    *ext_128 = _mm_subs_epi8(m10_1,m01_1);

    alpha_ptr+=8;
    beta_ptr+=8;
#elif defined(__arm__)

    m11_128        = (int8x16_t*)&m_11[k<<4];
    m10_128        = (int8x16_t*)&m_10[k<<4];
    ext_128        = (int8x16_t*)&ext[k<<4];

    m00_4 = vqaddq_s8(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
    m11_4 = vqaddq_s8(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
    m00_3 = vqaddq_s8(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
    m11_3 = vqaddq_s8(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11;
    m00_2 = vqaddq_s8(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00;
    m11_2 = vqaddq_s8(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11;
    m11_1 = vqaddq_s8(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11;
    m00_1 = vqaddq_s8(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00;
    m01_4 = vqaddq_s8(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01;
    m10_4 = vqaddq_s8(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10;
    m01_3 = vqaddq_s8(alpha_ptr[4],beta_ptr[2]); //ALPHA_BETA_3m01;
    m10_3 = vqaddq_s8(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10;
    m01_2 = vqaddq_s8(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01;
    m10_2 = vqaddq_s8(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
    m10_1 = vqaddq_s8(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
    m01_1 = vqaddq_s8(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;

    m01_1 = vmaxq_s8(m01_1,m01_2);
    m01_1 = vmaxq_s8(m01_1,m01_3);
    m01_1 = vmaxq_s8(m01_1,m01_4);
    m00_1 = vmaxq_s8(m00_1,m00_2);
    m00_1 = vmaxq_s8(m00_1,m00_3);
    m00_1 = vmaxq_s8(m00_1,m00_4);
    m10_1 = vmaxq_s8(m10_1,m10_2);
    m10_1 = vmaxq_s8(m10_1,m10_3);
    m10_1 = vmaxq_s8(m10_1,m10_4);
    m11_1 = vmaxq_s8(m11_1,m11_2);
    m11_1 = vmaxq_s8(m11_1,m11_3);
    m11_1 = vmaxq_s8(m11_1,m11_4);


    m01_1 = vqsubq_s8(m01_1,*m10_128);
    m00_1 = vqsubq_s8(m00_1,*m11_128);
    m10_1 = vqaddq_s8(m10_1,*m10_128);
    m11_1 = vqaddq_s8(m11_1,*m11_128);


    m01_1 = vmaxq_s8(m01_1,m00_1);
    m10_1 = vmaxq_s8(m10_1,m11_1);


    *ext_128 = vqsubq_s8(m10_1,m01_1);

    alpha_ptr+=8;
    beta_ptr+=8;

#endif
  }


}
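
/* Scalar view of the extrinsic computation above: with

     M(s,p) = max over transitions labelled (systematic,parity)=(s,p) of
              alpha_k(state') + beta_{k+1}(state)

   the code forms

     ext[k] = max( M(1,0) + m10[k], M(1,1) + m11[k] )
            - max( M(0,1) - m10[k], M(0,0) - m11[k] )

   i.e. the max-log LLR of the systematic bit with the corresponding branch
   metric folded back in. */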



//int pi2[n],pi3[n+8],pi5[n+8],pi4[n+8],pi6[n+8],
int *pi2tab8[188],*pi5tab8[188],*pi4tab8[188],*pi6tab8[188];

void free_td8(void)
{
  int ind;

  for (ind=0; ind<188; ind++) {
    free(pi2tab8[ind]);
    free(pi5tab8[ind]);
    free(pi4tab8[ind]);
    free(pi6tab8[ind]);
  }
}

void init_td8()
{

  int ind,i,j,n,n2,pi,pi3;
  short * base_interleaver;

  for (ind=0; ind<188; ind++) {

    n = f1f2mat[ind].nb_bits;
    base_interleaver=il_tb+f1f2mat[ind].beg_index;
#ifdef MEX
    // This is needed for the Mex implementation to make the memory persistent
    pi2tab8[ind] = mxMalloc((n+8)*sizeof(int));
    pi5tab8[ind] = mxMalloc((n+8)*sizeof(int));
    pi4tab8[ind] = mxMalloc((n+8)*sizeof(int));
    pi6tab8[ind] = mxMalloc((n+8)*sizeof(int));
#else
    pi2tab8[ind] = malloc((n+8)*sizeof(int));
    pi5tab8[ind] = malloc((n+8)*sizeof(int));
    pi4tab8[ind] = malloc((n+8)*sizeof(int));
    pi6tab8[ind] = malloc((n+8)*sizeof(int));
#endif

    if ((n&15)>0) {
      n2 = n+8;
    } else
      n2 = n;

    for (j=0,i=0; i<n2; i++,j+=16) {

      if (j>=n2)
        j-=(n2-1);

      pi2tab8[ind][i] = j;
      //    printf("pi2[%d] = %d\n",i,j);
    }

    for (i=0; i<n2; i++) {
      pi = base_interleaver[i];//(unsigned int)threegpplte_interleaver(f1,f2,n);
      pi3 = pi2tab8[ind][pi];
      pi4tab8[ind][pi2tab8[ind][i]] = pi3;
      pi5tab8[ind][pi3] = pi2tab8[ind][i];
      pi6tab8[ind][pi] = pi2tab8[ind][i];
    }

  }
}
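
/* init_td8 precomputes, for each of the 188 block sizes in f1f2mat, the
   composition of the 3GPP QPP interleaver (read from il_tb) with the internal
   16-column ordering pi2 (which steps through the block in strides of 16).
   The per-iteration permutations in the decoder then reduce to sequential
   table lookups, roughly

     systematic2[m] = ext[ pi4tab8[iind][m] ];   // interleave DEC1 extrinsic
     deint_ext2[m]  = ext2[ pi5tab8[iind][m] ];  // de-interleave DEC2 extrinsic

   where deint_ext2 is just a name for the values assembled into `tmp` in
   phy_threegpplte_turbo_decoder8 below. */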

unsigned char phy_threegpplte_turbo_decoder8(short *y,
    unsigned char *decoded_bytes,
    unsigned short n,
    unsigned short f1,
    unsigned short f2,
    unsigned char max_iterations,
    unsigned char crc_type,
    unsigned char F,
    time_stats_t *init_stats,
    time_stats_t *alpha_stats,
    time_stats_t *beta_stats,
    time_stats_t *gamma_stats,
    time_stats_t *ext_stats,
    time_stats_t *intl1_stats,
    time_stats_t *intl2_stats)
{

  /*  y is a pointer to the input
      decoded_bytes is a pointer to the decoded output
      n is the size in bits of the coded block, with the tail */

  int n2;

  llr_t y8[3*(n+16)] __attribute__((aligned(16)));


  /* NOTE: sized n+48 / n+32 rather than n+16: the termination loops below
     write s[]/s1[]/s2[] up to index n2+18, and compute_gamma8 with term_flag=1
     reads systematic bytes n2+16..n2+31 and writes m11/m10 bytes n2..n2+15 */
  llr_t systematic0[n+48] __attribute__ ((aligned(16)));
  llr_t systematic1[n+48] __attribute__ ((aligned(16)));
  llr_t systematic2[n+48] __attribute__ ((aligned(16)));
  llr_t yparity1[n+32] __attribute__ ((aligned(16)));
  llr_t yparity2[n+32] __attribute__ ((aligned(16)));

  llr_t ext[n+128] __attribute__((aligned(16)));
  llr_t ext2[n+128] __attribute__((aligned(16)));

  llr_t alpha[(n+32)*8] __attribute__ ((aligned(16)));
  llr_t beta[(n+32)*8] __attribute__ ((aligned(16)));
  llr_t m11[n+32] __attribute__ ((aligned(16)));
  llr_t m10[n+32] __attribute__ ((aligned(16)));


  //  int *pi2_p,*pi4_p,*pi5_p,*pi6_p;
  int *pi4_p,*pi5_p,*pi6_p;
  llr_t *s,*s1,*s2,*yp1,*yp2,*yp;

  unsigned int i,j,iind;//,pi;
  unsigned char iteration_cnt=0;
  unsigned int crc,oldcrc,crc_len;
  uint8_t temp;
#if defined(__x86_64__) || defined(__i386__)
  __m128i *yp128;
  __m128i tmp128[(n+8)>>3];
  __m128i tmp, zeros=_mm_setzero_si128();
#elif defined(__arm__)
  int8x16_t *yp128;
  int8x16_t tmp128[(n+8)>>3];
  int8x16_t tmp, zeros=vdupq_n_s8(0);
  const uint8_t __attribute__ ((aligned (16))) _Powers[16]= 
    { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
  
  // Set the powers of 2 (do it once for all, if applicable)
  uint8x16_t Powers= vld1q_u8(_Powers);
#endif

  int offset8_flag=0;

  if (crc_type > 3) {
    msg("Illegal crc length!\n");
    return 255;
  }


  if (init_stats) start_meas(init_stats);


  if ((n&15)>0) {
    n2 = n+8;
    offset8_flag=1;
  } else
    n2 = n;


  for (iind=0; iind < 188 && f1f2mat[iind].nb_bits != n; iind++);

  if ( iind == 188 ) {
    msg("Illegal frame length!\n");
    return 255;
  }

  switch (crc_type) {
  case CRC24_A:
  case CRC24_B:
    crc_len=3;
    break;

  case CRC16:
    crc_len=2;
    break;

  case CRC8:
    crc_len=1;
    break;

  default:
    crc_len=3;
  }

#if defined(__x86_64__) || defined(__i386__)

  // note: this loop reads slightly past the end of y, which makes valgrind complain
  __m128i avg=_mm_set1_epi32(0);

  for (i=0; i<(3*(n>>4))+1; i++) {
    __m128i tmp=_mm_abs_epi16(_mm_unpackhi_epi16(((__m128i*)y)[i],((__m128i*)y)[i]));
    avg=_mm_add_epi32(_mm_cvtepi16_epi32(_mm_abs_epi16(((__m128i*)y)[i])),avg);
    avg=_mm_add_epi32(_mm_cvtepi16_epi32(tmp),avg);
  }

  int32_t round_avg=(_mm_extract_epi32(avg,0)+_mm_extract_epi32(avg,1)+_mm_extract_epi32(avg,2)+_mm_extract_epi32(avg,3))/(n*3);

  // round_avg is the mean |LLR| of the 16-bit input; it selects the right-shift
  // applied before the saturating 16->8 bit packing below
  //printf("avg input turbo: %d sum %d taille bloc %d\n",round_avg,round_sum,n);

  if (round_avg < 16 )
    for (i=0,j=0; i<(3*(n2>>4))+1; i++,j+=2)
      ((__m128i *)y8)[i] = _mm_packs_epi16(((__m128i *)y)[j],((__m128i *)y)[j+1]);
  else if (round_avg < 32)
    for (i=0,j=0; i<(3*(n2>>4))+1; i++,j+=2)
      ((__m128i *)y8)[i] = _mm_packs_epi16(_mm_srai_epi16(((__m128i *)y)[j],1),_mm_srai_epi16(((__m128i *)y)[j+1],1));
  else if (round_avg < 64 )
    for (i=0,j=0; i<(3*(n2>>4))+1; i++,j+=2)
      ((__m128i *)y8)[i] = _mm_packs_epi16(_mm_srai_epi16(((__m128i *)y)[j],2),_mm_srai_epi16(((__m128i *)y)[j+1],2));
  else if (round_avg < 128)
    for (i=0,j=0; i<(3*(n2>>4))+1; i++,j+=2)
      ((__m128i *)y8)[i] = _mm_packs_epi16(_mm_srai_epi16(((__m128i *)y)[j],3),_mm_srai_epi16(((__m128i *)y)[j+1],3));
  else
    for (i=0,j=0; i<(3*(n2>>4))+1; i++,j+=2)
      ((__m128i *)y8)[i] = _mm_packs_epi16(_mm_srai_epi16(((__m128i *)y)[j],4),_mm_srai_epi16(((__m128i *)y)[j+1],4));

  yp128 = (__m128i*)y8;

#elif defined(__arm__)

  int32x4_t avg=vdupq_n_s32(0);

  for (i=0; i<(3*(n>>4))+1; i++) {
    int16x8_t tmp=vabsq_s16(((int16x8_t*)y)[i]);
    avg = vqaddq_s32(avg,vaddl_s16(((int16x4_t*)&tmp)[0],((int16x4_t*)&tmp)[1]));
  }

  int32_t round_avg=(vgetq_lane_s32(avg,0)+vgetq_lane_s32(avg,1)+vgetq_lane_s32(avg,2)+vgetq_lane_s32(avg,3))/(n*3);

  //printf("avg input turbo: %d sum %d taille bloc %d\n",round_avg,round_sum,n);

  // each int16x8_t input vector narrows to one int8x8_t output vector,
  // so the input index advances by one per iteration
  if (round_avg < 16 )
    for (i=0,j=0; i<(3*(n2>>3))+1; i++,j++)
      ((int8x8_t *)y8)[i] = vqmovn_s16(((int16x8_t *)y)[j]);
  else if (round_avg < 32)
    for (i=0,j=0; i<(3*(n2>>3))+1; i++,j++)
      ((int8x8_t *)y8)[i] = vqmovn_s16(vshrq_n_s16(((int16x8_t *)y)[j],1));
  else if (round_avg < 64 )
    for (i=0,j=0; i<(3*(n2>>3))+1; i++,j++)
      ((int8x8_t *)y8)[i] = vqmovn_s16(vshrq_n_s16(((int16x8_t *)y)[j],2));
  else
    for (i=0,j=0; i<(3*(n2>>3))+1; i++,j++)
      ((int8x8_t *)y8)[i] = vqmovn_s16(vshrq_n_s16(((int16x8_t *)y)[j],3));

  yp128 = (int8x16_t*)y8;

#endif

  s = systematic0;
  s1 = systematic1;
  s2 = systematic2;
  yp1 = yparity1;
  yp2 = yparity2;
  yp=y8;
#if 1

  for (i=0; i<16 ; i++ )
    for (j=0; j<n2; j+=16) {
      int k=i+j;
      s[k]=*yp++;
      yp1[k]=*yp++;
      yp2[k]=*yp++;
    }

#endif
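
  /* The loop above distributes the (s,p1,p2) triplets into the 16-column
     layout used by the vectorized MAP routines: byte lane i of vector v holds
     code bit i*(n2/16)+v, i.e. the block is split into 16 equal-length
     segments that are decoded in parallel. */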
#if 0

  for (i=0; i<n2; i+=16) {
    pi2_p = &pi2tab8[iind][i];

    j=pi2_p[0];
#if defined(__x86_64__) || defined(__i386__)
    s[j]   = _mm_extract_epi8(yp128[0],0);
    yp1[j] = _mm_extract_epi8(yp128[0],1);
    yp2[j] = _mm_extract_epi8(yp128[0],2);


    j=pi2_p[1];
    s[j]   = _mm_extract_epi8(yp128[0],3);
    yp1[j] = _mm_extract_epi8(yp128[0],4);
    yp2[j] = _mm_extract_epi8(yp128[0],5);


    j=pi2_p[2];
    s[j]   = _mm_extract_epi8(yp128[0],6);
    yp1[j] = _mm_extract_epi8(yp128[0],7);
    yp2[j] = _mm_extract_epi8(yp128[0],8);


    j=pi2_p[3];
    s[j]   = _mm_extract_epi8(yp128[0],9);
    yp1[j] = _mm_extract_epi8(yp128[0],10);
    yp2[j] = _mm_extract_epi8(yp128[0],11);


    j=pi2_p[4];
    s[j]   = _mm_extract_epi8(yp128[0],12);
    yp1[j] = _mm_extract_epi8(yp128[0],13);
    yp2[j] = _mm_extract_epi8(yp128[0],14);


    j=pi2_p[5];
    s[j]   = _mm_extract_epi8(yp128[0],15);
    yp1[j] = _mm_extract_epi8(yp128[1],0);
    yp2[j] = _mm_extract_epi8(yp128[1],1);


    j=pi2_p[6];
    s[j]   = _mm_extract_epi8(yp128[1],2);
    yp1[j] = _mm_extract_epi8(yp128[1],3);
    yp2[j] = _mm_extract_epi8(yp128[1],4);


    j=pi2_p[7];
    s[j]   = _mm_extract_epi8(yp128[1],5);
    yp1[j] = _mm_extract_epi8(yp128[1],6);
    yp2[j] = _mm_extract_epi8(yp128[1],7);


    j=pi2_p[8];
    s[j]   = _mm_extract_epi8(yp128[1],8);
    yp1[j] = _mm_extract_epi8(yp128[1],9);
    yp2[j] = _mm_extract_epi8(yp128[1],10);


    j=pi2_p[9];
    s[j]   = _mm_extract_epi8(yp128[1],11);
    yp1[j] = _mm_extract_epi8(yp128[1],12);
    yp2[j] = _mm_extract_epi8(yp128[1],13);


    j=pi2_p[10];
    s[j]   = _mm_extract_epi8(yp128[1],14);
    yp1[j] = _mm_extract_epi8(yp128[1],15);
    yp2[j] = _mm_extract_epi8(yp128[2],0);


    j=pi2_p[11];
    s[j]   = _mm_extract_epi8(yp128[2],1);
    yp1[j] = _mm_extract_epi8(yp128[2],2);
    yp2[j] = _mm_extract_epi8(yp128[2],3);


    j=pi2_p[12];
    s[j]   = _mm_extract_epi8(yp128[2],4);
    yp1[j] = _mm_extract_epi8(yp128[2],5);
    yp2[j] = _mm_extract_epi8(yp128[2],6);


    j=pi2_p[13];
    s[j]   = _mm_extract_epi8(yp128[2],7);
    yp1[j] = _mm_extract_epi8(yp128[2],8);
    yp2[j] = _mm_extract_epi8(yp128[2],9);


    j=pi2_p[14];
    s[j]   = _mm_extract_epi8(yp128[2],10);
    yp1[j] = _mm_extract_epi8(yp128[2],11);
    yp2[j] = _mm_extract_epi8(yp128[2],12);


    j=pi2_p[15];
    s[j]   = _mm_extract_epi8(yp128[2],13);
    yp1[j] = _mm_extract_epi8(yp128[2],14);
    yp2[j] = _mm_extract_epi8(yp128[2],15);


#elif defined(__arm__)
    s[j]   = vgetq_lane_s8(yp128[0],0);
    yp1[j] = vgetq_lane_s8(yp128[0],1);
    yp2[j] = vgetq_lane_s8(yp128[0],2);


    j=pi2_p[1];
    s[j]   = vgetq_lane_s8(yp128[0],3);
    yp1[j] = vgetq_lane_s8(yp128[0],4);
    yp2[j] = vgetq_lane_s8(yp128[0],5);


    j=pi2_p[2];
    s[j]   = vgetq_lane_s8(yp128[0],6);
    yp1[j] = vgetq_lane_s8(yp128[0],7);
    yp2[j] = vgetq_lane_s8(yp128[0],8);


    j=pi2_p[3];
    s[j]   = vgetq_lane_s8(yp128[0],9);
    yp1[j] = vgetq_lane_s8(yp128[0],10);
    yp2[j] = vgetq_lane_s8(yp128[0],11);


    j=pi2_p[4];
    s[j]   = vgetq_lane_s8(yp128[0],12);
    yp1[j] = vgetq_lane_s8(yp128[0],13);
    yp2[j] = vgetq_lane_s8(yp128[0],14);


    j=pi2_p[5];
    s[j]   = vgetq_lane_s8(yp128[0],15);
    yp1[j] = vgetq_lane_s8(yp128[1],0);
    yp2[j] = vgetq_lane_s8(yp128[1],1);


    j=pi2_p[6];
    s[j]   = vgetq_lane_s8(yp128[1],2);
    yp1[j] = vgetq_lane_s8(yp128[1],3);
    yp2[j] = vgetq_lane_s8(yp128[1],4);


    j=pi2_p[7];
    s[j]   = vgetq_lane_s8(yp128[1],5);
    yp1[j] = vgetq_lane_s8(yp128[1],6);
    yp2[j] = vgetq_lane_s8(yp128[1],7);


    j=pi2_p[8];
    s[j]   = vgetq_lane_s8(yp128[1],8);
    yp1[j] = vgetq_lane_s8(yp128[1],9);
    yp2[j] = vgetq_lane_s8(yp128[1],10);


    j=pi2_p[9];
    s[j]   = vgetq_lane_s8(yp128[1],11);
    yp1[j] = vgetq_lane_s8(yp128[1],12);
    yp2[j] = vgetq_lane_s8(yp128[1],13);


    j=pi2_p[10];
    s[j]   = vgetq_lane_s8(yp128[1],14);
    yp1[j] = vgetq_lane_s8(yp128[1],15);
    yp2[j] = vgetq_lane_s8(yp128[2],0);


    j=pi2_p[11];
    s[j]   = vgetq_lane_s8(yp128[2],1);
    yp1[j] = vgetq_lane_s8(yp128[2],2);
    yp2[j] = vgetq_lane_s8(yp128[2],3);


    j=pi2_p[12];
    s[j]   = vgetq_lane_s8(yp128[2],4);
    yp1[j] = vgetq_lane_s8(yp128[2],5);
    yp2[j] = vgetq_lane_s8(yp128[2],6);


    j=pi2_p[13];
    s[j]   = vgetq_lane_s8(yp128[2],7);
    yp1[j] = vgetq_lane_s8(yp128[2],8);
    yp2[j] = vgetq_lane_s8(yp128[2],9);


    j=pi2_p[14];
    s[j]   = vgetq_lane_s8(yp128[2],10);
    yp1[j] = vgetq_lane_s8(yp128[2],11);
    yp2[j] = vgetq_lane_s8(yp128[2],12);


    j=pi2_p[15];
    s[j]   = vgetq_lane_s8(yp128[2],13);
    yp1[j] = vgetq_lane_s8(yp128[2],14);
    yp2[j] = vgetq_lane_s8(yp128[2],15);

#endif
    yp128+=3;

  }

#endif

  yp=(llr_t*)yp128;

  if (n2>n) {
    /*
    s[n]=0;s[n+1]=0;s[n+2]=0;s[n+3]=0;
    s[n+4]=0;s[n+5]=0;s[n+6]=0;s[n+7]=0;
    s1[n]=0;s1[n+1]=0;s1[n+2]=0;s1[n+3]=0;
    s1[n+4]=0;s1[n+5]=0;s1[n+6]=0;s1[n+7]=0;
    s2[n]=0;s2[n+1]=0;s2[n+2]=0;s2[n+3]=0;
    s2[n+4]=0;s2[n+5]=0;s2[n+6]=0;s2[n+7]=0;*/
    yp=(llr_t*)(y8+n);
  }

  //  printf("n=%d,n2=%d\n",n,n2);

  // Termination
  for (i=n2; i<n2+3; i++) {
    s[i]= *yp;
    s1[i] = s[i] ;
    s2[i] = s[i];
    yp++;
    yp1[i] = *yp;
    yp++;
#ifdef DEBUG_LOGMAP
    msg("Term 1 (%d): %d %d\n",i,s[i],yp1[i]);
#endif //DEBUG_LOGMAP
  }

  for (i=n2+16; i<n2+19; i++) {
    s[i]= *yp;
    s1[i] = s[i] ;
    s2[i] = s[i];
    yp++;
    yp2[i-16] = *yp;
    yp++;
#ifdef DEBUG_LOGMAP
    msg("Term 2 (%d): %d %d\n",i-16,s[i],yp2[i-16]);
#endif //DEBUG_LOGMAP
  }

#ifdef DEBUG_LOGMAP
  msg("\n");
#endif //DEBUG_LOGMAP

  if (init_stats) stop_meas(init_stats);

  // do log_map from first parity bit

  log_map8(systematic0,yparity1,m11,m10,alpha,beta,ext,n2,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);

  while (iteration_cnt++ < max_iterations) {

#ifdef DEBUG_LOGMAP
    printf("\n*******************ITERATION %d (n %d, n2 %d), ext %p\n\n",iteration_cnt,n,n2,ext);
#endif //DEBUG_LOGMAP

    if (intl1_stats) start_meas(intl1_stats);
    pi4_p=pi4tab8[iind];

    for (i=0; i<(n2>>4); i++) { // steady-state portion
#if defined(__x86_64__) || defined(__i386__)
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],0);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],1);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],2);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],3);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],4);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],5);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],6);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],7);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],8);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],9);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],10);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],11);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],12);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],13);
      tmp=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],14);
      ((__m128i *)systematic2)[i]=_mm_insert_epi8(tmp,((llr_t*)ext)[*pi4_p++],15);
#elif defined(__arm__)
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,0);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,1);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,2);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,3);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,4);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,5);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,6);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,7);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,8);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,9);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,10);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,11);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,12);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,13);
      tmp=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,14);
      ((int8x16_t *)systematic2)[i]=vsetq_lane_s8(((llr_t*)ext)[*pi4_p++],tmp,15);
#endif
    }

    if (intl1_stats) stop_meas(intl1_stats);

    // do log_map from second parity bit

    log_map8(systematic2,yparity2,m11,m10,alpha,beta,ext2,n2,1,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);



    pi5_p=pi5tab8[iind];
    uint16_t decoded_bytes_interl[6144/16] __attribute__((aligned(16)));

    if ((n2&0x7f) == 0) {  // n2 is a multiple of 128 bits
      for (i=0; i<(n2>>4); i++) {
#if defined(__x86_64__) || defined(__i386__)
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],0);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],1);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],2);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],3);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],4);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],5);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],6);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],7);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],8);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],9);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],10);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],11);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],12);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],13);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],14);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15);
        decoded_bytes_interl[i]=(uint16_t) _mm_movemask_epi8(_mm_cmpgt_epi8(tmp,zeros));
        ((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i*)ext)[i]),((__m128i *)systematic0)[i]);
#elif defined(__arm__)
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,2);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,3);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,4);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,5);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,6);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,7);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,8);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,9);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,10);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,11);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,12);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,13);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,14);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,15);
	uint64x2_t Mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcgtq_s8(tmp,zeros), Powers))));
	// store into decoded_bytes_interl (as in the x86 path); the re-ordering
	// pass below reads the hard decisions from there
	vst1q_lane_u8(&((uint8_t*)&decoded_bytes_interl[i])[0], (uint8x16_t)Mask, 0);
	vst1q_lane_u8(&((uint8_t*)&decoded_bytes_interl[i])[1], (uint8x16_t)Mask, 8);
	((int8x16_t *)systematic1)[i] = vqaddq_s8(vqsubq_s8(tmp,((int8x16_t*)ext)[i]),((int8x16_t *)systematic0)[i]);
#endif
      }

    } else {
      for (i=0; i<(n2>>4); i++) {
#if defined(__x86_64__) || defined(__i386__)
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],0);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],1);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],2);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],3);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],4);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],5);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],6);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],7);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],8);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],9);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],10);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],11);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],12);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],13);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],14);
        tmp=_mm_insert_epi8(tmp,ext2[*pi5_p++],15);
        tmp128[i] = _mm_adds_epi8(((__m128i *)ext2)[i],((__m128i *)systematic2)[i]);

        ((__m128i *)systematic1)[i] = _mm_adds_epi8(_mm_subs_epi8(tmp,((__m128i*)ext)[i]),((__m128i *)systematic0)[i]);
#elif defined(__arm__)
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,0);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,1);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,2);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,3);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,4);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,5);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,6);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,7);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,8);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,9);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,10);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,11);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,12);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,13);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,14);
        tmp=vsetq_lane_s8(ext2[*pi5_p++],tmp,15);
        tmp128[i] = vqaddq_s8(((int8x16_t *)ext2)[i],((int8x16_t *)systematic2)[i]);

        ((int8x16_t *)systematic1)[i] = vqaddq_s8(vqsubq_s8(tmp,((int8x16_t*)ext)[i]),((int8x16_t *)systematic0)[i]);

#endif 
     }
    }
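
    /* In scalar terms, both branches above compute (with deint_ext2 denoting
       the de-interleaved DEC2 extrinsic assembled into `tmp`):

         systematic1[k] = deint_ext2[k] - ext[k] + systematic0[k]

       i.e. the input to the next DEC1 half-iteration is the channel
       systematic LLR plus the a-priori information from DEC2, with DEC1's own
       previous extrinsic removed.  The multiple-of-128 branch additionally
       extracts the hard decisions on the fly via the movemask. */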

    // Check if we decoded the block
    if (iteration_cnt>1) {
      if (intl2_stats) start_meas(intl2_stats);

      if ((n2&0x7f) == 0) {  // n2 is a multiple of 128 bits

        // re-order the decoded bits into the regular order,
        // as they are presently arranged as 16 sequential columns
#if defined(__x86_64__) || defined(__i386__)
        __m128i* dbytes=(__m128i*)decoded_bytes_interl;
        __m128i shuffle=SHUFFLE16(7,6,5,4,3,2,1,0);
        __m128i mask  __attribute__((aligned(16)));
        int n_128=n2>>7;

        for (i=0; i<n_128; i++) {
          mask=_mm_set1_epi16(1);
          __m128i tmp __attribute__((aligned(16)));
          tmp=_mm_shuffle_epi8(dbytes[i],shuffle);
          __m128i tmp2 __attribute__((aligned(16))) ;

          tmp2=_mm_and_si128(tmp,mask);
          tmp2=_mm_cmpeq_epi16(tmp2,mask);
	  //	  printf("decoded_bytes %p\n",decoded_bytes);
          decoded_bytes[n_128*0+i]=(uint8_t) _mm_movemask_epi8(_mm_packs_epi16(tmp2,zeros));
          int j;

          for (j=1; j<16; j++) {
            mask=_mm_slli_epi16(mask,1);
            tmp2=_mm_and_si128(tmp,mask);
            tmp2=_mm_cmpeq_epi16(tmp2,mask);
            decoded_bytes[n_128*j +i]=(uint8_t) _mm_movemask_epi8(_mm_packs_epi16(tmp2,zeros));
          }
        }
#elif defined(__arm__)
        uint8x16_t* dbytes=(uint8x16_t*)decoded_bytes_interl;
        uint16x8_t mask  __attribute__((aligned(16)));
        int n_128=n2>>7;

        for (i=0; i<n_128; i++) {
          mask=vdupq_n_u16(1);
          // reverse the order of the eight 16-bit words (NEON counterpart of
          // the SHUFFLE16(7,...,0) byte shuffle used in the x86 path above)
          uint16x8_t tmp16=vrev64q_u16(vreinterpretq_u16_u8(dbytes[i]));
          tmp16=vcombine_u16(vget_high_u16(tmp16),vget_low_u16(tmp16));
          int j;

          for (j=0; j<16; j++) {
            // test bit j of each 16-bit word and pack the eight results into one byte
            uint8x8_t sel=vand_u8(vmovn_u16(vtstq_u16(tmp16,mask)),vget_low_u8(Powers));
            decoded_bytes[n_128*j+i]=(uint8_t)vget_lane_u64(vpaddl_u32(vpaddl_u16(vpaddl_u8(sel))),0);
            mask=vshlq_n_u16(mask,1);
          }
        }

#endif
      } else {
        pi6_p=pi6tab8[iind];

        for (i=0; i<(n2>>4); i++) {
#if defined(__x86_64__) || defined(__i386__)
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],7);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],6);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],5);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],4);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],3);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],2);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],1);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],0);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],15);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],14);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],13);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],12);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],11);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],10);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],9);
          tmp=_mm_insert_epi8(tmp, ((llr_t *)tmp128)[*pi6_p++],8);
          tmp=_mm_cmpgt_epi8(tmp,zeros);
          ((uint16_t *)decoded_bytes)[i]=(uint16_t)_mm_movemask_epi8(tmp);
#elif defined(__arm__)
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,7);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,6);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,5);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,4);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,3);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,2);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,1);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,0);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,15);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,14);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,13);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,12);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,11);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,10);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,9);
          tmp=vsetq_lane_s8(((llr_t *)tmp128)[*pi6_p++],tmp,8);
	  uint64x2_t Mask= vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcgtq_s8(tmp,zeros), Powers))));
	  vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[0], (uint8x16_t)Mask, 0);
	  vst1q_lane_u8(&((uint8_t*)&decoded_bytes[i])[1], (uint8x16_t)Mask, 8);
#endif
        }
      }

      // check the CRC
      oldcrc= *((unsigned int *)(&decoded_bytes[(n>>3)-crc_len]));

      switch (crc_type) {

      case CRC24_A:
        oldcrc&=0x00ffffff;
        crc = crc24a(&decoded_bytes[F>>3],
                     n-24-F)>>8;
        temp=((uint8_t *)&crc)[2];
        ((uint8_t *)&crc)[2] = ((uint8_t *)&crc)[0];
        ((uint8_t *)&crc)[0] = temp;
        break;

      case CRC24_B:
        oldcrc&=0x00ffffff;
        crc = crc24b(decoded_bytes,
                     n-24)>>8;
        temp=((uint8_t *)&crc)[2];
        ((uint8_t *)&crc)[2] = ((uint8_t *)&crc)[0];
        ((uint8_t *)&crc)[0] = temp;
        break;

      case CRC16:
        oldcrc&=0x0000ffff;
        crc = crc16(decoded_bytes,
                    n-16)>>16;
        break;

      case CRC8:
        oldcrc&=0x000000ff;
        crc = crc8(decoded_bytes,
                   n-8)>>24;
        break;

      default:
        printf("FATAL: 3gpplte_turbo_decoder_sse.c: Unknown CRC\n");
        return(255);
        break;
      }

      if (intl2_stats) stop_meas(intl2_stats);

      if (crc == oldcrc) {
        return(iteration_cnt);
      }
    }

    // run another iteration if the block is not yet decoded
    if (iteration_cnt < max_iterations) {
      log_map8(systematic1,yparity1,m11,m10,alpha,beta,ext,n2,0,F,offset8_flag,alpha_stats,beta_stats,gamma_stats,ext_stats);
#if defined(__x86_64__) || defined(__i386__)
      __m128i* ext_128=(__m128i*) ext;
      __m128i* s1_128=(__m128i*) systematic1;
      __m128i* s0_128=(__m128i*) systematic0;
#elif defined(__arm__)
      int8x16_t* ext_128=(int8x16_t*) ext;
      int8x16_t* s1_128=(int8x16_t*) systematic1;
      int8x16_t* s0_128=(int8x16_t*) systematic0;
#endif
      int myloop=n2>>4;

      // ext = ext - systematic1 + systematic0: swap the a-priori-loaded
      // systematic input back to the plain channel systematic in the new extrinsic
      for (i=0; i<myloop; i++) {
#if defined(__x86_64__) || defined(__i386__)
        *ext_128=_mm_adds_epi8(_mm_subs_epi8(*ext_128,*s1_128++),*s0_128++);
#elif defined(__arm__)
        *ext_128=vqaddq_s8(vqsubq_s8(*ext_128,*s1_128++),*s0_128++);
#endif
        ext_128++;
      }
    }
  }

  return(iteration_cnt);

}