/*
 * Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The OpenAirInterface Software Alliance licenses this file to You under
 * the OAI Public License, Version 1.0  (the "License"); you may not use this file
 * except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.openairinterface.org/?page_id=698
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *-------------------------------------------------------------------------------
 * For more information about the OpenAirInterface (OAI) Software Alliance:
 *      contact@openairinterface.org
 */

/*! \file PHY/NR_UE_TRANSPORT/nr_dlsch_decoding.c
* \brief Top-level routines for decoding  Turbo-coded (DLSCH) transport channels from 36-212, V8.6 2009-03
* \author R. Knopp
* \date 2011
* \version 0.1
* \company Eurecom
* \email: knopp@eurecom.fr
* \note
* \warning
*/

#include "PHY/defs_nr_UE.h"
#include "PHY/phy_extern_nr_ue.h"
#include "PHY/CODING/coding_extern.h"
#include "PHY/CODING/coding_defs.h"
#include "PHY/NR_TRANSPORT/nr_transport_common_proto.h"
#include "PHY/NR_TRANSPORT/nr_dlsch.h"
//#include "SCHED/extern.h"
#include "SIMULATION/TOOLS/sim.h"
#include "targets/RT/USER/nr-uesoftmodem.h"
#include "PHY/CODING/nrLDPC_decoder/nrLDPC_decoder.h"
#include "PHY/CODING/nrLDPC_decoder/nrLDPC_types.h"
//#define DEBUG_DLSCH_DECODING

// Number of int8_t entries in the static LDPC buffer declared below. The
// trailing note on the define records 26112 (NR_LDPC_NCOL_BG1*NR_LDPC_ZMAX).
#define OAI_LDPC_MAX_NUM_LLR 27000//26112 // NR_LDPC_NCOL_BG1*NR_LDPC_ZMAX

// Buffer handed to nrLDPC_decoder() by nr_dlsch_decoding(); the decoded bytes
// of each segment are read back from it. It is a single file-scope buffer, so
// it is shared by every call in this translation unit.
static int8_t llrProcBuf[OAI_LDPC_MAX_NUM_LLR] __attribute__ ((aligned(32)));

// Running statistics updated by nr_dlsch_decoding(): number of segments passed
// to the LDPC decoder, and number of those flagged as decoding errors.
static uint64_t nb_total_decod =0;
static uint64_t nb_error_decod =0;

//extern double cpuf;

/*! \brief Release a UE DLSCH descriptor and every buffer reachable from it.
 *
 * For each of the dlsch->Mdlharq HARQ processes that is allocated, the payload
 * buffer b, the per-segment buffers c[] and d[] and finally the HARQ structure
 * itself are freed; the corresponding pointers are cleared as they go.
 *
 * \param dlsch descriptor to release; a NULL pointer is ignored
 */
void free_nr_ue_dlsch(NR_UE_DLSCH_t *dlsch)
{
  if (dlsch == NULL)
    return;

  for (int pid = 0; pid < dlsch->Mdlharq; pid++) {
    NR_DL_UE_HARQ_t *harq = dlsch->harq_processes[pid];

    if (harq == NULL)
      continue;

    if (harq->b) {
      free16(harq->b,MAX_DLSCH_PAYLOAD_BYTES);
      harq->b = NULL;
    }

    // Decoded-segment buffers: released unconditionally, entry by entry.
    for (int seg = 0; seg < MAX_NUM_NR_DLSCH_SEGMENTS; seg++) {
      free16(harq->c[seg],1056);
      harq->c[seg] = NULL;
    }

    // Soft-bit buffers: only the entries that were actually allocated.
    for (int seg = 0; seg < MAX_NUM_NR_DLSCH_SEGMENTS; seg++) {
      if (harq->d[seg]) {
        free16(harq->d[seg],(3*8448)*sizeof(short));
        harq->d[seg] = NULL;
      }
    }

    free16(harq,sizeof(NR_DL_UE_HARQ_t));
    dlsch->harq_processes[pid] = NULL;
  }

  // The caller's own pointer is passed by value and cannot be reset from here.
  free16(dlsch,sizeof(NR_UE_DLSCH_t));
}

NR_UE_DLSCH_t *new_nr_ue_dlsch(uint8_t Kmimo,uint8_t Mdlharq,uint32_t Nsoft,uint8_t max_ldpc_iterations,uint8_t N_RB_DL, uint8_t abstraction_flag)
{

  NR_UE_DLSCH_t *dlsch;
  uint8_t exit_flag = 0,i,r;

  unsigned char bw_scaling =1;

  switch (N_RB_DL) {
  case 6:
    bw_scaling =16;
    break;

  case 25:
    bw_scaling =4;
    break;

  case 50:
    bw_scaling =2;
    break;

  default:
    bw_scaling =1;
    break;
  }

  dlsch = (NR_UE_DLSCH_t *)malloc16(sizeof(NR_UE_DLSCH_t));

  if (dlsch) {
    memset(dlsch,0,sizeof(NR_UE_DLSCH_t));
    dlsch->Kmimo = Kmimo;
    dlsch->Mdlharq = Mdlharq;
    dlsch->Nsoft = Nsoft;
    dlsch->max_ldpc_iterations = max_ldpc_iterations;

    for (i=0; i<Mdlharq; i++) {
      //      printf("new_ue_dlsch: Harq process %d\n",i);
      dlsch->harq_processes[i] = (NR_DL_UE_HARQ_t *)malloc16(sizeof(NR_DL_UE_HARQ_t));

      if (dlsch->harq_processes[i]) {
        memset(dlsch->harq_processes[i],0,sizeof(NR_DL_UE_HARQ_t));
        dlsch->harq_processes[i]->first_tx=1;
        dlsch->harq_processes[i]->b = (uint8_t*)malloc16(MAX_DLSCH_PAYLOAD_BYTES/bw_scaling);

        if (dlsch->harq_processes[i]->b)
          memset(dlsch->harq_processes[i]->b,0,MAX_DLSCH_PAYLOAD_BYTES/bw_scaling);
        else
          exit_flag=3;

        if (abstraction_flag == 0) {
          for (r=0; r<MAX_NUM_DLSCH_SEGMENTS/bw_scaling; r++) {
            dlsch->harq_processes[i]->c[r] = (uint8_t*)malloc16(1056);

            if (dlsch->harq_processes[i]->c[r])
              memset(dlsch->harq_processes[i]->c[r],0,1056);
            else
              exit_flag=2;

            dlsch->harq_processes[i]->d[r] = (short*)malloc16((3*8448)*sizeof(short));

            if (dlsch->harq_processes[i]->d[r])
              memset(dlsch->harq_processes[i]->d[r],0,(3*8448)*sizeof(short));
            else
              exit_flag=2;
          }
        }
      } else {
        exit_flag=1;
      }
    }

    if (exit_flag==0)
      return(dlsch);
  }

  printf("new_ue_dlsch with size %zu: exit_flag = %u\n",sizeof(NR_DL_UE_HARQ_t), exit_flag);
  free_nr_ue_dlsch(dlsch);

  return(NULL);
}

/*! \brief Undo the bit-level scrambling on a block of DLSCH LLRs, in place.
 *
 * The sequence generator is seeded with (n_RNTI<<15) + (q<<14) + Nid. One
 * 32-bit word is fetched from lte_gold_generic() for every 32 LLRs, and every
 * LLR whose scrambling bit is 1 has its sign flipped.
 *
 * \param llr    LLRs to unscramble, modified in place
 * \param size   number of LLRs to process; the parameter is a uint8_t, so at
 *               most 255 LLRs can be handled per call
 * \param q      codeword index used in the seed
 * \param Nid    scrambling identity used in the seed
 * \param n_RNTI RNTI used in the seed
 */
void nr_dlsch_unscrambling(int16_t* llr,
                         uint8_t size,
                         uint8_t q,
                         uint32_t Nid,
                         uint32_t n_RNTI) {

  uint32_t x1;
  uint32_t x2 = (n_RNTI<<15) + (q<<14) + Nid;
  uint32_t seq_word = 0;
  uint8_t restart = 1;   // only the first generator call (re)initialises the sequence

  for (int idx = 0; idx < size; idx++) {
    const int bit_pos = idx & 0x1f;

    // A fresh 32-bit chunk of the sequence is needed at every multiple of 32.
    if (bit_pos == 0) {
      seq_word = lte_gold_generic(&x1, &x2, restart);
      restart = 0;
    }

    if ((seq_word >> bit_pos) & 1)
      llr[idx] = -llr[idx];
  }

}

/*! \brief Decode one NR DLSCH transport block held in a HARQ process.
 *
 * Steps performed:
 *  - derive TBS and G from the HARQ process allocation and, for a first
 *    transmission (round 0), run nr_segmentation() to obtain C, K, Z and F;
 *  - for every code segment: rate de-matching, de-interleaving, construction
 *    of the 8-bit LLR input and LDPC decoding into harq_process->c[r];
 *  - update the HARQ state (ACK/NAK, round, status, error counters);
 *  - on success, concatenate the segment payloads into harq_process->b.
 *
 * \param phy_vars_ue  UE context (used for timing statistics and log messages)
 * \param dlsch_llr    input LLRs for the whole transport block
 * \param frame_parms  frame parameters (N_RB_DL, symbols_per_slot are read)
 * \param dlsch        DLSCH descriptor (max_ldpc_iterations, Mdlharq are read; last_iteration_cnt is written)
 * \param harq_process HARQ process to decode into
 * \param frame        frame number, used in log messages
 * \param nb_symb_sch  number of symbols carrying the shared channel
 * \param nr_tti_rx    slot/TTI index, used in log messages
 * \param harq_pid     HARQ process id, stored in the ACK report
 * \param is_crnti     when non-zero, ACK/NACK debug messages are logged
 * \param llr8_flag    not used by this function
 * \return dlsch->max_ldpc_iterations when dlsch_llr, harq_process or
 *         frame_parms is NULL or when rate de-matching fails;
 *         1+dlsch->max_ldpc_iterations when the segment count is illegal or a
 *         segment is flagged in error; otherwise the result code of the last
 *         decoded segment (2 on success).
 *
 * Not reentrant: the decoder output goes through the file-scope llrProcBuf.
 */
uint32_t  nr_dlsch_decoding(PHY_VARS_NR_UE *phy_vars_ue,
                         short *dlsch_llr,
                         NR_DL_FRAME_PARMS *frame_parms,
                         NR_UE_DLSCH_t *dlsch,
                         NR_DL_UE_HARQ_t *harq_process,
                         uint32_t frame,
                         uint16_t nb_symb_sch,
                         uint8_t nr_tti_rx,
                         uint8_t harq_pid,
                         uint8_t is_crnti,
                         uint8_t llr8_flag)
{

#if UE_TIMING_TRACE
  time_stats_t *dlsch_rate_unmatching_stats=&phy_vars_ue->dlsch_rate_unmatching_stats;
  time_stats_t *dlsch_turbo_decoding_stats=&phy_vars_ue->dlsch_turbo_decoding_stats;
  time_stats_t *dlsch_deinterleaving_stats=&phy_vars_ue->dlsch_deinterleaving_stats;
#endif
  uint32_t A,E;
  uint32_t G;
  uint32_t ret,offset;
  int32_t no_iteration_ldpc;
  uint32_t r,r_offset=0,Kr,Kr_bytes,K_bytes_F,err_flag=0;
  t_nrLDPC_dec_params decParams;
  t_nrLDPC_dec_params* p_decParams = &decParams;
  t_nrLDPC_time_stats procTime;
  t_nrLDPC_time_stats* p_procTime =&procTime ;
  // Work buffers for one codeword (68 columns * 384 max lifting size). Both are
  // accessed through __m128i pointers below, hence the explicit alignment.
  int16_t z [68*384] __attribute__ ((aligned(32)));
  int8_t l [68*384] __attribute__ ((aligned(32)));
  uint8_t kb, kc;
  uint8_t Ilbrm = 0;
  uint32_t Tbslbrm;
  uint16_t nb_rb;
  uint8_t nb_re_dmrs = 6;
  uint16_t length_dmrs = 1;

  uint32_t i,j;

  __m128i *pv = (__m128i*)&z;
  __m128i *pl = (__m128i*)&l;

  if (!dlsch_llr) {
    printf("dlsch_decoding.c: NULL dlsch_llr pointer\n");
    return(dlsch->max_ldpc_iterations);
  }

  if (!harq_process) {
    printf("dlsch_decoding.c: NULL harq_process pointer\n");
    return(dlsch->max_ldpc_iterations);
  }

  if (!frame_parms) {
    printf("dlsch_decoding.c: NULL frame_parms pointer\n");
    return(dlsch->max_ldpc_iterations);
  }

  nb_rb = harq_process->nb_rb;

  harq_process->trials[harq_process->round]++;

  harq_process->TBS = nr_compute_tbs(harq_process->mcs,nb_rb,nb_symb_sch,nb_re_dmrs,length_dmrs, harq_process->Nl);

  A = harq_process->TBS;
  ret = dlsch->max_ldpc_iterations;

  harq_process->G = nr_get_G(nb_rb, nb_symb_sch, nb_re_dmrs, length_dmrs, harq_process->Qm,harq_process->Nl);
  G = harq_process->G;

  //printf("DLSCH Decoding, harq_pid %d TBS %d G %d mcs %d Nl %d nb_symb_sch %d \n",harq_pid,A,G, harq_process->mcs, harq_process->Nl, nb_symb_sch);

  if (harq_process->round == 0) {
    // This is a new packet, so compute quantities regarding segmentation
    harq_process->B = A+24;
    nr_segmentation(NULL,
                    NULL,
                    harq_process->B,
                    &harq_process->C,
                    &harq_process->K,
                    &harq_process->Z,
                    &harq_process->F);

#ifdef DEBUG_DLSCH_DECODING
    printf("dlsch decoding nr segmentation Z %d\n", harq_process->Z);
    if (!(frame%100))
      printf("K %d C %d Z %d nl %d \n", harq_process->K, harq_process->C, harq_process->Z, harq_process->Nl);
#endif
  }

  // decParams is a local, so the lifting size has to be set on every call,
  // retransmissions (round > 0) included, from the value kept in the HARQ
  // process by the segmentation of the first transmission.
  p_decParams->Z = harq_process->Z;

  // Base graph selection from the number of information columns K/Z.
  kb = harq_process->K/harq_process->Z;
  if ( kb==22){
    p_decParams->BG = 1;
    p_decParams->R = 13;
    kc = 68;
  }
  else{
    p_decParams->BG = 2;
    p_decParams->R = 13;
    kc = 52;
  }

  // NOTE(review): the decoder iteration limit is hard-coded to 2 while the
  // failure test further down compares the returned count against 10 and the
  // return codes are expressed with dlsch->max_ldpc_iterations. Confirm the
  // intended relation against nrLDPC_decoder()'s return convention.
  p_decParams->numMaxIter = 2;
  p_decParams->outMode= 0;

  err_flag = 0;
  r_offset = 0;

  unsigned char bw_scaling =1;

  switch (frame_parms->N_RB_DL) {

  case 106:
    bw_scaling =2;
    break;

  default:
    bw_scaling =1;
    break;
  }

  if (harq_process->C > MAX_NUM_NR_DLSCH_SEGMENTS/bw_scaling) {
    LOG_E(PHY,"Illegal harq_process->C %d > %d\n",harq_process->C,MAX_NUM_NR_DLSCH_SEGMENTS/bw_scaling);
    return((1+dlsch->max_ldpc_iterations));
  }
#ifdef DEBUG_DLSCH_DECODING
  printf("Segmentation: C %d, K %d\n",harq_process->C,harq_process->K);
#endif

  opp_enabled=1;

  Kr = harq_process->K;
  Kr_bytes = Kr>>3;

  // Segment size without filler bits; used below as a count of 8-LLR vectors.
  K_bytes_F = Kr_bytes-(harq_process->F>>3);

  Tbslbrm = nr_compute_tbs(28,nb_rb,frame_parms->symbols_per_slot,0,0, harq_process->Nl);

  for (r=0; r<harq_process->C; r++) {

    printf("start rx segment %d\n",r);

#if UE_TIMING_TRACE
    start_meas(dlsch_rate_unmatching_stats);
#endif

#ifdef DEBUG_DLSCH_DECODING
    LOG_D(PHY,"HARQ_PID %d Rate Matching Segment %d (coded bits %d,unpunctured/repeated bits %d, TBS %d, mod_order %d, nb_rb %d, Nl %d, rv %d, round %d)...\n",
          harq_pid,r, G,
          Kr*3,
          harq_process->TBS,
          harq_process->Qm,
          harq_process->nb_rb,
          harq_process->Nl,
          harq_process->rvidx,
          harq_process->round);
#endif

    if (nr_rate_matching_ldpc_rx(Ilbrm,
                                 Tbslbrm,
                                 p_decParams->BG,
                                 p_decParams->Z,
                                 G,
                                 harq_process->w[r],
                                 dlsch_llr+r_offset,
                                 harq_process->C,
                                 harq_process->rvidx,
                                 (harq_process->round==0)?1:0,
                                 harq_process->Qm,
                                 harq_process->Nl,
                                 r,
                                 &E)==-1) {

#if UE_TIMING_TRACE
      stop_meas(dlsch_rate_unmatching_stats);
#endif
      LOG_E(PHY,"dlsch_decoding.c: Problem in rate_matching\n");
      return(dlsch->max_ldpc_iterations);
    } else
    {
#if UE_TIMING_TRACE
      stop_meas(dlsch_rate_unmatching_stats);
#endif
    }
    // E LLRs of the input were consumed by this segment.
    r_offset += E;

#if UE_TIMING_TRACE
    start_meas(dlsch_deinterleaving_stats);
#endif
    nr_deinterleaving_ldpc(E,
                           harq_process->Qm,
                           harq_process->d[r],
                           harq_process->w[r]);

#if UE_TIMING_TRACE
    stop_meas(dlsch_deinterleaving_stats);
#endif
#ifdef DEBUG_DLSCH_DECODING

    if (r==0) {
      write_output("decoder_llr.m","decllr",dlsch_llr,G,1,0);
      write_output("decoder_in.m","dec",&harq_process->d[0][0],(3*8*Kr_bytes)+12,1,0);
    }

    printf("decoder input(segment %d) :",r);
    for (int dbg_i=0;dbg_i<(3*8*Kr_bytes)+12;dbg_i++)
      printf("%d : %d\n",dbg_i,harq_process->d[r][dbg_i]);
    printf("\n");
#endif

    memset(harq_process->c[r],0,Kr_bytes);

    // Segments following a failed one are still rate de-matched and
    // de-interleaved above, but no longer decoded.
    // TODO: the per-segment CRC check (CRC24_A when C==1, CRC24_B otherwise)
    // is not performed here; the verdict relies on the decoder's iteration count.
    if (err_flag == 0) {

#if UE_TIMING_TRACE
      start_meas(dlsch_turbo_decoding_stats);
#endif

      // Number of 8-LLR vectors covering the 2*Z leading positions that are
      // zeroed rather than taken from d[r].
      const uint32_t lead_vecs = (2*p_decParams->Z)>>3;

      memset(pv,0,2*harq_process->Z*sizeof(int16_t));
      // NOTE(review): this marks F entries starting at vector K_bytes_F, while
      // the first copy loop below writes vectors [lead_vecs, K_bytes_F+lead_vecs)
      // and the second one starts at Kr_bytes+lead_vecs. The ranges overlap and
      // leave F entries unwritten when F > 0 - confirm the intended layout of
      // the filler positions against the rate-matching output.
      memset((pv+K_bytes_F),127,harq_process->F*sizeof(int16_t));

      // The soft bits are read straight from d[r]; every index used below is
      // smaller than (kc-2)*Z.
      for (i=lead_vecs, j = 0; i < K_bytes_F+lead_vecs; i++, j++)
      {
        pv[i]= _mm_loadu_si128((__m128i*)(&harq_process->d[r][8*j]));
      }

      for (i=Kr_bytes+lead_vecs,j=Kr_bytes; i < ((kc*p_decParams->Z)>>3); i++, j++)
      {
        pv[i]= _mm_loadu_si128((__m128i*)(&harq_process->d[r][8*j]));
      }

      // Narrow the 16-bit LLRs to the decoder's 8-bit input with signed saturation.
      for (i=0, j=0; j < ((kc*p_decParams->Z)>>4);  i+=2, j++)
      {
        pl[j] = _mm_packs_epi16(pv[i],pv[i+1]);
      }

      no_iteration_ldpc = nrLDPC_decoder(p_decParams,
                                         (int8_t*)&pl[0],
                                         llrProcBuf,
                                         p_procTime);

      nb_total_decod++;
      if (no_iteration_ldpc > 10){
        nb_error_decod++;
        ret = 1+dlsch->max_ldpc_iterations;
      }
      else {
        ret=2;
      }

      printf("Error number of iteration LPDC %d %llu/%llu \n", no_iteration_ldpc,
             (unsigned long long)nb_error_decod,(unsigned long long)nb_total_decod);fflush(stdout);

      // Copy the decoded bytes of this segment out of the shared buffer.
      for (uint32_t m=0; m < (Kr>>3); m ++)
      {
        harq_process->c[r][m]= (uint8_t) llrProcBuf[m];
      }

#ifdef DEBUG_DLSCH_DECODING
      printf("output decoder %d %d %d %d %d \n", harq_process->c[r][0], harq_process->c[r][1], harq_process->c[r][2],harq_process->c[r][3], harq_process->c[r][4]);
      printf("no_iterations_ldpc %d (ret %d)\n",no_iteration_ldpc,ret);
      //write_output("dec_output.m","dec0",harq_process->c[0],Kr_bytes,1,4);
#endif


#if UE_TIMING_TRACE
      stop_meas(dlsch_turbo_decoding_stats);
#endif
    }

    if ((err_flag == 0) && (ret>=(1+dlsch->max_ldpc_iterations))) {// a Code segment is in error so break;
      LOG_W(PHY,"AbsSubframe %d.%d CRC failed, segment %d/%d \n",frame%1024,nr_tti_rx,r,harq_process->C-1);
      err_flag = 1;
    }
  }

  if (err_flag == 1) {
#if UE_DEBUG_TRACE
    LOG_I(PHY,"[UE %d] DLSCH: Setting NAK for SFN/SF %d/%d (pid %d, status %d, round %d, TBS %d, mcs %d) Kr %d r %d harq_process->round %d\n",
        phy_vars_ue->Mod_id, frame, nr_tti_rx, harq_pid,harq_process->status, harq_process->round,harq_process->TBS,harq_process->mcs,Kr,r,harq_process->round);
#endif
    harq_process->harq_ack.ack = 0;
    harq_process->harq_ack.harq_id = harq_pid;
    harq_process->harq_ack.send_harq_status = 1;
    harq_process->errors[harq_process->round]++;
    harq_process->round++;

    // Give up on this transport block once the round counter reaches Mdlharq.
    if (harq_process->round >= dlsch->Mdlharq) {
      harq_process->status = SCH_IDLE;
      harq_process->round  = 0;
    }
    if(is_crnti)
    {
    LOG_D(PHY,"[UE %d] DLSCH: Setting NACK for nr_tti_rx %d (pid %d, pid status %d, round %d/Max %d, TBS %d)\n",
               phy_vars_ue->Mod_id,nr_tti_rx,harq_pid,harq_process->status,harq_process->round,dlsch->Mdlharq,harq_process->TBS);
    }

    return((1+dlsch->max_ldpc_iterations));
  } else {
#if UE_DEBUG_TRACE
      LOG_I(PHY,"[UE %d] DLSCH: Setting ACK for nr_tti_rx %d TBS %d mcs %d nb_rb %d harq_process->round %d\n",
           phy_vars_ue->Mod_id,nr_tti_rx,harq_process->TBS,harq_process->mcs,harq_process->nb_rb, harq_process->round);
#endif

    harq_process->status = SCH_IDLE;
    harq_process->round  = 0;
    harq_process->harq_ack.ack = 1;
    harq_process->harq_ack.harq_id = harq_pid;
    harq_process->harq_ack.send_harq_status = 1;

    if(is_crnti)
    {
    LOG_D(PHY,"[UE %d] DLSCH: Setting ACK for nr_tti_rx %d (pid %d, round %d, TBS %d)\n",phy_vars_ue->Mod_id,nr_tti_rx,harq_pid,harq_process->round,harq_process->TBS);
    }

  }

  // Reassembly of Transport block here
  offset = 0;
  Kr = harq_process->K;
  Kr_bytes = Kr>>3;

  for (r=0; r<harq_process->C; r++) {

    // Payload bytes of one segment: segment size minus the filler bytes and,
    // when the block was split, minus 3 bytes per segment. The same value
    // drives both the copy length and the write offset so they cannot diverge.
    const uint32_t seg_payload_bytes = Kr_bytes - (harq_process->F>>3) - ((harq_process->C>1)?3:0);

    memcpy(harq_process->b+offset,
           harq_process->c[r],
           seg_payload_bytes);
    offset += seg_payload_bytes;

#ifdef DEBUG_DLSCH_DECODING
    printf("Segment %d : Kr= %d bytes\n",r,Kr_bytes);
    printf("copied %d bytes to b sequence (harq_pid %d)\n",
              seg_payload_bytes,harq_pid);
              printf("b[0] = %x,c[%d] = %x\n",
              harq_process->b[offset],
              harq_process->F>>3,
              harq_process->c[r][harq_process->F>>3]);
#endif

  }

  dlsch->last_iteration_cnt = ret;

  return(ret);
}

#ifdef UE_DLSCH_PARALLELISATION
uint32_t  dlsch_decoding_mthread(PHY_VARS_NR_UE *phy_vars_ue,
			 UE_rxtx_proc_t *proc,
                         int eNB_id,
                         short *dlsch_llr,
                         NR_DL_FRAME_PARMS *frame_parms,
                         NR_UE_DLSCH_t *dlsch,
                         NR_DL_UE_HARQ_t *harq_process,
                         uint32_t frame,
                         uint8_t nr_tti_rx,
                         uint8_t harq_pid,
                         uint8_t is_crnti,
                         uint8_t llr8_flag)
{

#if UE_TIMING_TRACE
  time_stats_t *dlsch_rate_unmatching_stats=&phy_vars_ue->dlsch_rate_unmatching_stats;
  time_stats_t *dlsch_turbo_decoding_stats=&phy_vars_ue->dlsch_turbo_decoding_stats;
  time_stats_t *dlsch_deinterleaving_stats=&phy_vars_ue->dlsch_deinterleaving_stats;
#endif
  uint32_t A,E;
  uint32_t G;
  uint32_t ret,offset;
  uint16_t iind;
  //  uint8_t dummy_channel_output[(3*8*block_length)+12];
  short dummy_w[MAX_NUM_DLSCH_SEGMENTS][3*(8448+64)];
  uint32_t r,r_offset=0,Kr,Kr_bytes,err_flag=0,Kr_int,kb,kc;
  uint8_t crc_type;
  //UE_rxtx_proc_t *proc = &phy_vars_ue->proc;
  int32_t no_iteration_ldpc;
  int Cby2;
  /*uint8_t C;
  uint8_t Qm;
  uint8_t Nl;
  uint8_t r_thread;
  uint32_t Er, Gp,GpmodC;*/
  t_nrLDPC_dec_params decParams;
    t_nrLDPC_dec_params* p_decParams = &decParams;
    t_nrLDPC_time_stats procTime;
    t_nrLDPC_time_stats* p_procTime =&procTime ;
    int16_t z [68*384];
    int8_t l [68*384];
    //__m128i l;
    int16_t inv_d [68*384];
    int16_t *p_invd =&inv_d;

    uint32_t i,j;
    uint32_t k;

    __m128i *pv = (__m128i*)&z;
    __m128i *pl = (__m128i*)&l;


#ifdef DEBUG_DLSCH_DECODING
  uint16_t i;
#endif
  //#ifdef __AVX2__
#if 0
  int Kr_last,skipped_last=0;
  uint8_t (*tc_2cw)(int16_t *y,
		    int16_t *y2,
		    uint8_t *,
		    uint8_t *,
		    uint16_t,
		    uint16_t,
		    uint16_t,
		    uint8_t,
		    uint8_t,
		    uint8_t,
		    time_stats_t *,
		    time_stats_t *,
		    time_stats_t *,
		    time_stats_t *,
		    time_stats_t *,
		    time_stats_t *,
		    time_stats_t *);

#endif
  uint8_t (*tc)(int16_t *y,
                uint8_t *,
                uint16_t,
                uint16_t,
                uint16_t,
                uint8_t,
                uint8_t,
                uint8_t,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *);




  if (!dlsch_llr) {
    printf("dlsch_decoding.c: NULL dlsch_llr pointer\n");
    return(dlsch->max_ldpc_iterations);
  }

  if (!harq_process) {
    printf("dlsch_decoding.c: NULL harq_process pointer\n");
    return(dlsch->max_ldpc_iterations);
  }

  if (!frame_parms) {
    printf("dlsch_decoding.c: NULL frame_parms pointer\n");
    return(dlsch->max_ldpc_iterations);
  }

  if (nr_tti_rx> (10*frame_parms->ttis_per_subframe-1)) {
    printf("dlsch_decoding.c: Illegal subframe index %d\n",nr_tti_rx);
    return(dlsch->max_ldpc_iterations);
  }

  if (dlsch->harq_ack[nr_tti_rx].ack != 2) {
    LOG_D(PHY, "[UE %d] DLSCH @ SF%d : ACK bit is %d instead of DTX even before PDSCH is decoded!\n",
        phy_vars_ue->Mod_id, nr_tti_rx, dlsch->harq_ack[nr_tti_rx].ack);
  }

  if (llr8_flag == 0) {
    //#ifdef __AVX2__
#if 0
    tc_2cw = phy_threegpplte_turbo_decoder16avx2;
#endif
    tc = phy_threegpplte_turbo_decoder16;
  }
  else
  {
	  AssertFatal (harq_process->TBS >= 256 , "Mismatch flag nbRB=%d TBS=%d mcs=%d Qm=%d RIV=%d round=%d \n",
			  harq_process->nb_rb, harq_process->TBS,harq_process->mcs,harq_process->Qm,harq_process->rvidx,harq_process->round);
	    tc = phy_threegpplte_turbo_decoder8;
  }


  //  nb_rb = dlsch->nb_rb;

  /*
  if (nb_rb > frame_parms->N_RB_DL) {
    printf("dlsch_decoding.c: Illegal nb_rb %d\n",nb_rb);
    return(max_ldpc_iterations);
    }*/

  /*harq_pid = dlsch->current_harq_pid[phy_vars_ue->current_thread_id[subframe]];
  if (harq_pid >= 8) {
    printf("dlsch_decoding.c: Illegal harq_pid %d\n",harq_pid);
    return(max_ldpc_iterations);
  }
  */


  harq_process->trials[harq_process->round]++;

  A = harq_process->TBS; //2072 for QPSK 1/3

  ret = dlsch->max_ldpc_iterations;


  G = harq_process->G;

  proc->decoder_main_available = 1;
  proc->decoder_thread_available = 0;
  proc->decoder_thread_available1 = 0;
  //get_G(frame_parms,nb_rb,dlsch->rb_alloc,mod_order,num_pdcch_symbols,phy_vars_ue->frame,subframe);

  //  printf("DLSCH Decoding, harq_pid %d Ndi %d\n",harq_pid,harq_process->Ndi);

  if (harq_process->round == 0) {
      // This is a new packet, so compute quantities regarding segmentation
      harq_process->B = A+24;
 #ifdef TD_DECODING
      lte_segmentation(NULL,
                       NULL,
                       harq_process->B,
                       &harq_process->C,
                       &harq_process->Cplus,
                       &harq_process->Cminus,
                       &harq_process->Kplus,
                       &harq_process->Kminus,
                       &harq_process->F);
      //  CLEAR LLR's HERE for first packet in process
  #else
      nr_segmentation(NULL,
      	                    NULL,
      	                    harq_process->B,
      	                    &harq_process->C,
      	                    &harq_process->Kplus,
      	                    &harq_process->Kminus,
      						&harq_process->Z,
      	                    &harq_process->F);
      	p_decParams->Z = harq_process->Z;
  #endif

    }

 	kb = harq_process->Kplus/harq_process->Z;
  	  if ( kb==22){
  		  p_decParams->BG = 1;
  		  p_decParams->R = 89;
  		  kc = 68;
  	  }
  	  else{
  		  p_decParams->BG = 2;
  		  p_decParams->R = 13;
  		  kc = 52;
  	  	  }

      p_decParams->numMaxIter = 2;
      Kr = p_decParams->Z*kb;
      p_decParams->outMode= 0;

  /*
  else {
    printf("dlsch_decoding.c: Ndi>0 not checked yet!!\n");
    return(max_ldpc_iterations);
  }
  */
  err_flag = 0;
  r_offset = 0;

  unsigned char bw_scaling =1;

  switch (frame_parms->N_RB_DL) {
  case 6:
    bw_scaling =16;
    break;

  case 25:
    bw_scaling =4;
    break;

  case 50:
    bw_scaling =2;
    break;

  default:
    bw_scaling =1;
    break;
  }

  if (harq_process->C > MAX_NUM_DLSCH_SEGMENTS/bw_scaling) {
    LOG_E(PHY,"Illegal harq_process->C %d > %d\n",harq_process->C,MAX_NUM_DLSCH_SEGMENTS/bw_scaling);
    return((1+dlsch->max_ldpc_iterations));
  }
#ifdef DEBUG_DLSCH_DECODING
  printf("Segmentation: C %d, Cminus %d, Kminus %d, Kplus %d\n",harq_process->C,harq_process->Cminus,harq_process->Kminus,harq_process->Kplus);
#endif

  opp_enabled=1;
if (harq_process->C>1) { // wakeup worker if more than 1 segment
     if (pthread_mutex_lock(&proc->mutex_dlsch_td) != 0) {
         LOG_E( PHY, "[SCHED][UE %d][Slot0] error locking mutex for UE dlsch td\n",phy_vars_ue->Mod_id );
         exit_fun("nothing to add");
     }

     /*Qm= harq_process->Qm;
     Nl=harq_process->Nl;
     r_thread = harq_process->C/2-1;
     C= harq_process->C;

     Gp = G/Nl/Qm;
      GpmodC = Gp%C;



      if (r_thread < (C-(GpmodC)))
        Er = Nl*Qm * (Gp/C);
      else
        Er = Nl*Qm * ((GpmodC==0?0:1) + (Gp/C));
     printf("mthread Er %d\n", Er);

     printf("mthread instance_cnt_dlsch_td %d\n",  proc->instance_cnt_dlsch_td);*/

     proc->instance_cnt_dlsch_td++;
               proc->eNB_id    = eNB_id;
               proc->harq_pid  = harq_pid;
               proc->llr8_flag = llr8_flag;
               //proc->r[0] = 1;

     if (proc->instance_cnt_dlsch_td == 0)
     {
     LOG_D(PHY,"unblock dlsch td processing thread blocked on instance_cnt_dlsch_td : %d \n", proc->instance_cnt_dlsch_td );
         if (pthread_cond_signal(&proc->cond_dlsch_td) != 0) {
             LOG_E( PHY, "[SCHED][UE %d][Slot0] ERROR pthread_cond_signal for UE dlsch td\n", phy_vars_ue->Mod_id);
             exit_fun("nothing to add");
         }
         if (pthread_mutex_unlock(&proc->mutex_dlsch_td) != 0) {
             LOG_E( PHY, "[SCHED][UE %d][Slot0] error unlocking mutex for UE dlsch td \n",phy_vars_ue->Mod_id );
             exit_fun("nothing to add");
         }

     } else
     {
         LOG_E( PHY, "[SCHED][UE %d] UE dlsch td thread busy (IC %d)!!\n", phy_vars_ue->Mod_id, proc->instance_cnt_dlsch_td);
         if (proc->instance_cnt_dlsch_td > 4)
             exit_fun("instance_cnt_dlsch_td > 4");
     }
     //AssertFatal(pthread_cond_signal(&proc->cond_slot1_dl_processing) ==0 ,"");
     AssertFatal(pthread_mutex_unlock(&proc->mutex_dlsch_td) ==0,"");

     if (harq_process->C>2) {
    	 if (pthread_mutex_lock(&proc->mutex_dlsch_td1) != 0) {
    	          LOG_E( PHY, "[SCHED][UE %d][Slot0] error locking mutex for UE dlsch td\n",phy_vars_ue->Mod_id );
    	          exit_fun("nothing to add");
    	      }

    	 proc->instance_cnt_dlsch_td1++;
                    proc->eNB_id    = eNB_id;
                    proc->harq_pid  = harq_pid;
                    proc->llr8_flag = llr8_flag;
                   // proc->Er = Er;

          if (proc->instance_cnt_dlsch_td1 == 0)
          {
          LOG_D(PHY,"unblock slot1 dl processing thread blocked on instance_cnt_dlsch_td : %d \n", proc->instance_cnt_dlsch_td1 );
              if (pthread_cond_signal(&proc->cond_dlsch_td1) != 0) {
                  LOG_E( PHY, "[SCHED][UE %d][Slot0] ERROR pthread_cond_signal for UE dlsch td\n", phy_vars_ue->Mod_id);
                  exit_fun("nothing to add");
              }
              if (pthread_mutex_unlock(&proc->mutex_dlsch_td1) != 0) {
                  LOG_E( PHY, "[SCHED][UE %d][Slot0] error unlocking mutex for UE dlsch td \n",phy_vars_ue->Mod_id );
                  exit_fun("nothing to add");
              }

          } else
          {
              LOG_E( PHY, "[SCHED][UE %d] UE dlsch td thread 1 busy (IC %d)!!\n", phy_vars_ue->Mod_id, proc->instance_cnt_dlsch_td1);
              if (proc->instance_cnt_dlsch_td1 > 4)
                  exit_fun("instance_cnt_dlsch_td1 > 4");
          }

          AssertFatal(pthread_mutex_unlock(&proc->mutex_dlsch_td1) ==0,"");

     }
	/*
	if (pthread_mutex_timedlock(&proc->mutex_td,&wait) != 0) {
       printf("[eNB] ERROR pthread_mutex_lock for TD thread (IC %d)\n", proc->instance_cnt_td);
       exit_fun( "error locking mutex_fep" );
       return -1;
     }

     if (proc->instance_cnt_td==0) {
       printf("[UE] TD thread busy\n");
       exit_fun("TD thread busy");
       pthread_mutex_unlock( &proc->mutex_td );
       return -1;
     }

     ++proc->instance_cnt_td;

     proc->tdp.UE        = phy_vars_ue;
     proc->tdp.eNB_id    = eNB_id;
     proc->tdp.harq_pid  = harq_pid;
     proc->tdp.llr8_flag = llr8_flag;

     printf("----- 2thread llr flag %d tdp flag %d\n",llr8_flag, proc->tdp.llr8_flag);


     // wakeup worker to do second half segments
     if (pthread_cond_signal(&proc->cond_td) != 0) {
       printf("[UE] ERROR pthread_cond_signal for td thread exit\n");
       exit_fun( "ERROR pthread_cond_signal" );
       return (1+dlsch->last_iteration_cnt);
     }

     pthread_mutex_unlock( &proc->mutex_td );*/


     Cby2 = 1; //harq_process->C/2;
     //proc->decoder_main_available = 1;
   }
   else {
     Cby2 = 1;
   }

  for (r=0; r<Cby2; r++) {

    // Get Turbo interleaver parameters
#ifdef TD_DECODING
	  if (r<harq_process->Cminus)
      Kr = harq_process->Kminus;
    else
      Kr = harq_process->Kplus;
#else
	  Kr = harq_process->Kplus;
#endif

    Kr_bytes = Kr>>3;

     //workaround for nr ldpc using lte interleaving
  if (dlsch->harq_processes[harq_pid]->C >= 2)
      	Kr_int = G/(3*dlsch->harq_processes[harq_pid]->C);
  else
  	Kr_int = Kr;
  
    if (Kr_bytes<=64)
      iind = (Kr_bytes-5);
    else if (Kr_bytes <=128)
      iind = 59 + ((Kr_bytes-64)>>1);
    else if (Kr_bytes <= 256)
      iind = 91 + ((Kr_bytes-128)>>2);
    else if (Kr_bytes <= 768)
      iind = 123 + ((Kr_bytes-256)>>3);
    else {
      //printf("dlsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes);
      //return(dlsch->max_ldpc_iterations);
    }

#ifdef DEBUG_DLSCH_DECODING
    printf("f1 %d, f2 %d, F %d\n",f1f2mat_old[2*iind],f1f2mat_old[1+(2*iind)],(r==0) ? harq_process->F : 0);
#endif

#if UE_TIMING_TRACE
    start_meas(dlsch_rate_unmatching_stats);
#endif
    memset(&dummy_w[r][0],0,3*(8448+64)*sizeof(short));
    harq_process->RTC[r] = generate_dummy_w(Kr_int,
                                            (uint8_t*) &dummy_w[r][0],
                                            (r==0) ? harq_process->F : 0);

#ifdef DEBUG_DLSCH_DECODING
    LOG_D(PHY,"HARQ_PID %d Rate Matching Segment %d (coded bits %d,unpunctured/repeated bits %d, TBS %d, mod_order %d, nb_rb %d, Nl %d, rv %d, round %d)...\n",
          harq_pid,r, G,
          Kr*3,
          harq_process->TBS,
          harq_process->Qm,
          harq_process->nb_rb,
          harq_process->Nl,
          harq_process->rvidx,
          harq_process->round);
#endif

#ifdef DEBUG_DLSCH_DECODING
    printf(" in decoding dlsch->harq_processes[harq_pid]->rvidx = %d\n", dlsch->harq_processes[harq_pid]->rvidx);
#endif
    if (lte_rate_matching_turbo_rx(harq_process->RTC[r],
                                   G,
                                   harq_process->w[r],
                                   (uint8_t*)&dummy_w[r][0],
                                   dlsch_llr+r_offset,
                                   harq_process->C,
                                   dlsch->Nsoft,
                                   dlsch->Mdlharq,
                                   dlsch->Kmimo,
                                   harq_process->rvidx,
                                   (harq_process->round==0)?1:0,
                                   harq_process->Qm,
                                   harq_process->Nl,
                                   r,
                                   &E)==-1) {
#if UE_TIMING_TRACE
      stop_meas(dlsch_rate_unmatching_stats);
#endif
      LOG_E(PHY,"dlsch_decoding.c: Problem in rate_matching\n");
      return(dlsch->max_ldpc_iterations);
    } else
    {
#if UE_TIMING_TRACE
      stop_meas(dlsch_rate_unmatching_stats);
#endif
    }
    r_offset += E;
    //printf("main thread r_offset %d\n",r_offset);

    /*
    printf("Subblock deinterleaving, d %p w %p\n",
     harq_process->d[r],
     harq_process->w);
    */
#if UE_TIMING_TRACE
    start_meas(dlsch_deinterleaving_stats);
#endif
    sub_block_deinterleaving_turbo(4+Kr,
                                   &harq_process->d[r][96],

                                   harq_process->w[r]);
#if UE_TIMING_TRACE
    stop_meas(dlsch_deinterleaving_stats);
#endif
#ifdef DEBUG_DLSCH_DECODING
    /*
    if (r==0) {
              write_output("decoder_llr.m","decllr",dlsch_llr,G,1,0);
              write_output("decoder_in.m","dec",&harq_process->d[0][96],(3*8*Kr_bytes)+12,1,0);
    }

    printf("decoder input(segment %d) :",r);
    int i; for (i=0;i<(3*8*Kr_bytes)+12;i++)
      printf("%d : %d\n",i,harq_process->d[r][96+i]);
      printf("\n");*/
#endif


    //    printf("Clearing c, %p\n",harq_process->c[r]);
    memset(harq_process->c[r],0,Kr_bytes);

    //    printf("done\n");
    if (harq_process->C == 1)
      crc_type = CRC24_A;
    else
      crc_type = CRC24_B;

    /*
    printf("decoder input(segment %d)\n",r);
    for (i=0;i<(3*8*Kr_bytes)+12;i++)
      if ((harq_process->d[r][96+i]>7) ||
    (harq_process->d[r][96+i] < -8))
    printf("%d : %d\n",i,harq_process->d[r][96+i]);
    printf("\n");
    */

    //#ifndef __AVX2__
#if 1
    if (err_flag == 0) {
/*
        LOG_I(PHY, "turbo algo Kr=%d cb_cnt=%d C=%d nbRB=%d crc_type %d TBSInput=%d TBSHarq=%d TBSplus24=%d mcs=%d Qm=%d RIV=%d round=%d maxIter %d\n",
                            Kr,r,harq_process->C,harq_process->nb_rb,crc_type,A,harq_process->TBS,
                            harq_process->B,harq_process->mcs,harq_process->Qm,harq_process->rvidx,harq_process->round,dlsch->max_ldpc_iterations);
*/
    	if (llr8_flag) {
    		AssertFatal (Kr >= 256, "turbo algo issue Kr=%d cb_cnt=%d C=%d nbRB=%d TBSInput=%d TBSHarq=%d TBSplus24=%d mcs=%d Qm=%d RIV=%d round=%d\n",
    				Kr,r,harq_process->C,harq_process->nb_rb,A,harq_process->TBS,harq_process->B,harq_process->mcs,harq_process->Qm,harq_process->rvidx,harq_process->round);
    	}
#if UE_TIMING_TRACE
        start_meas(dlsch_turbo_decoding_stats);
#endif
      LOG_D(PHY,"mthread AbsSubframe %d.%d Start turbo segment %d/%d \n",frame%1024,nr_tti_rx,r,harq_process->C-1);
#ifdef TD_DECODING
      ret = tc
            (&harq_process->d[r][96],
             harq_process->c[r],
             Kr,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);

#else

		memset(pv,0,2*p_decParams->Z*sizeof(int16_t));
        //memset(pl,0,2*p_decParams->Z*sizeof(int8_t));

		//if (A < 1000){

      	for (i=2*p_decParams->Z/8, j = 0; i < (68*p_decParams->Z/8+1); i++, j++)
      	{
      		pv[i]= _mm_loadu_si128((__m128i*)(&inv_d[8*j]));
      	}
      	/*}
		else{
      	for (i=2*p_decParams->Z/8, j = 0; i < (68*p_decParams->Z/8+1); i++, j++)
      	      	{
      	      		pv[i]= _mm_loadu_si128((__m128i*)&harq_process->d[r][96+8*j]);
      	      	}
		}*/
		for (i=0, j=0; j < (68*p_decParams->Z/16);  i+=2, j++)
      	      	{
      				//printf("mm packs i %d j %d\n", i, j);
      				//print128_num(pv[i]);
      				//print128_num(pv[i+1]);
      	      		pl[j] = _mm_packs_epi16(pv[i],pv[i+1]);
      	      		//print128_num2bytes(pl[j]);
      	      	}

		no_iteration_ldpc = nrLDPC_decoder(p_decParams,
      			&pl[0],
				llrProcBuf,
          		p_procTime);

		if (no_iteration_ldpc > 2)
			printf("Error number of iteration LPDC %d\n", no_iteration_ldpc);
		//else
			//printf("OK number of iteration LPDC %d\n", no_iteration_ldpc);

		for (int m=0; m < Kr>>3; m ++)
		      	      	{
		      				harq_process->c[r][m]= (uint8_t) llrProcBuf[m];
		      	      	}

		      	/*for (int u=0; u < Kr>>3; u ++)
		      	      	      	{
		      						ullrProcBuf[u]= (uint8_t) llrProcBuf[u];
		      	      	      	}


		      	printf("output unsigned ullrProcBuf \n");

		      	for (int j=0; j < Kr>>3; j ++)
		      	      	       	      	{

		      	      						printf(" %d \n", ullrProcBuf[j]);

		      	      	      	      	}
		     	printf(" \n");*/
#endif
		//printf("output channel decoder %d %d %d %d %d \n", harq_process->c[r][0], harq_process->c[r][1], harq_process->c[r][2],harq_process->c[r][3], harq_process->c[r][4]);

		 //printf("output decoder %d %d %d %d %d \n", harq_process->c[r][0], harq_process->c[r][1], harq_process->c[r][2],harq_process->c[r][3], harq_process->c[r][4]);


#if UE_TIMING_TRACE
      stop_meas(dlsch_turbo_decoding_stats);
#endif
    }
#else
    if ((harq_process->C == 1) ||
	((r==harq_process->C-1) && (skipped_last==0))) { // last segment with odd number of segments

#if UE_TIMING_TRACE
        start_meas(dlsch_turbo_decoding_stats);
#endif
      ret = tc
            (&harq_process->d[r][96],
             harq_process->c[r],
             Kr,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
 #if UE_TIMING_TRACE
      stop_meas(dlsch_turbo_decoding_stats);
#endif
      //      printf("single decode, exit\n");
      //      exit(-1);
    }
    else {
    // we can merge code segments
      if ((skipped_last == 0) && (r<harq_process->C-1)) {
	skipped_last = 1;
	Kr_last = Kr;
      }
      else {
	skipped_last=0;

	if (Kr_last == Kr) { // decode 2 code segments with AVX2 version
#ifdef DEBUG_DLSCH_DECODING
	  printf("single decoding segment %d (%p)\n",r-1,&harq_process->d[r-1][96]);
#endif
#if UE_TIMING_TRACE
	  start_meas(dlsch_turbo_decoding_stats);
#endif
#ifdef DEBUG_DLSCH_DECODING
	  printf("double decoding segments %d,%d (%p,%p)\n",r-1,r,&harq_process->d[r-1][96],&harq_process->d[r][96]);
#endif
	  ret = tc_2cw
            (&harq_process->d[r-1][96],
	     &harq_process->d[r][96],
             harq_process->c[r-1],
             harq_process->c[r],
             Kr,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
	  /*
	  ret = tc
            (&harq_process->d[r-1][96],
             harq_process->c[r-1],
             Kr_last,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);

	     exit(-1);*/
#if UE_TIMING_TRACE
      stop_meas(dlsch_turbo_decoding_stats);
#endif
	}
	else { // Kr_last != Kr
#if UE_TIMING_TRACE
	  start_meas(dlsch_turbo_decoding_stats);
#endif
	  ret = tc
            (&harq_process->d[r-1][96],
             harq_process->c[r-1],
             Kr_last,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
#if UE_TIMING_TRACE
      stop_meas(dlsch_turbo_decoding_stats);

	  start_meas(dlsch_turbo_decoding_stats);
#endif

	  ret = tc
            (&harq_process->d[r][96],
             harq_process->c[r],
             Kr,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);

#if UE_TIMING_TRACE

	  stop_meas(dlsch_turbo_decoding_stats);

	  /*printf("Segmentation: C %d r %d, dlsch_rate_unmatching_stats %5.3f dlsch_deinterleaving_stats %5.3f  dlsch_turbo_decoding_stats %5.3f \n",
              harq_process->C,
              r,
              dlsch_rate_unmatching_stats->p_time/(cpuf*1000.0),
              dlsch_deinterleaving_stats->p_time/(cpuf*1000.0),
              dlsch_turbo_decoding_stats->p_time/(cpuf*1000.0));*/
#endif
	}
      }
    }
#endif


    if ((err_flag == 0) && (ret>=(1+dlsch->max_ldpc_iterations))) {// a code segment is in error: set err_flag (no break here; the decode step above is skipped once err_flag is set)
      LOG_D(PHY,"AbsSubframe %d.%d CRC failed, segment %d/%d \n",frame%1024,nr_tti_rx,r,harq_process->C-1);
      err_flag = 1;
    }
  }

  int32_t frame_rx_prev = frame;
  int32_t tti_rx_prev = nr_tti_rx - 1;
  if (tti_rx_prev < 0) {
    frame_rx_prev--;
    tti_rx_prev += 10*frame_parms->ttis_per_subframe;
  }
  frame_rx_prev = frame_rx_prev%1024;

  if (err_flag == 1) {
#if UE_DEBUG_TRACE
    LOG_I(PHY,"[UE %d] DLSCH: Setting NAK for SFN/SF %d/%d (pid %d, status %d, round %d, TBS %d, mcs %d) Kr %d r %d harq_process->round %d\n",
        phy_vars_ue->Mod_id, frame, nr_tti_rx, harq_pid,harq_process->status, harq_process->round,harq_process->TBS,harq_process->mcs,Kr,r,harq_process->round);
#endif
    dlsch->harq_ack[nr_tti_rx].ack = 0;
    dlsch->harq_ack[nr_tti_rx].harq_id = harq_pid;
    dlsch->harq_ack[nr_tti_rx].send_harq_status = 1;
    harq_process->errors[harq_process->round]++;
    harq_process->round++;


    //    printf("Rate: [UE %d] DLSCH: Setting NACK for subframe %d (pid %d, round %d)\n",phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->round);
    if (harq_process->round >= dlsch->Mdlharq) {
      harq_process->status = SCH_IDLE;
      harq_process->round  = 0;
    }
    if(is_crnti)
    {
    LOG_D(PHY,"[UE %d] DLSCH: Setting NACK for nr_tti_rx %d (pid %d, pid status %d, round %d/Max %d, TBS %d)\n",
               phy_vars_ue->Mod_id,nr_tti_rx,harq_pid,harq_process->status,harq_process->round,dlsch->Mdlharq,harq_process->TBS);
    }

    return((1+dlsch->max_ldpc_iterations));
  } else {
#if UE_DEBUG_TRACE
      LOG_I(PHY,"[UE %d] DLSCH: Setting ACK for nr_tti_rx %d TBS %d mcs %d nb_rb %d\n",
           phy_vars_ue->Mod_id,nr_tti_rx,harq_process->TBS,harq_process->mcs,harq_process->nb_rb);
#endif

    harq_process->status = SCH_IDLE;
    harq_process->round  = 0;
    dlsch->harq_ack[nr_tti_rx].ack = 1;
    dlsch->harq_ack[nr_tti_rx].harq_id = harq_pid;
    dlsch->harq_ack[nr_tti_rx].send_harq_status = 1;
    //LOG_I(PHY,"[UE %d] DLSCH: Setting ACK for SFN/SF %d/%d (pid %d, status %d, round %d, TBS %d, mcs %d)\n",
      //  phy_vars_ue->Mod_id, frame, subframe, harq_pid, harq_process->status, harq_process->round,harq_process->TBS,harq_process->mcs);

    if(is_crnti)
    {
    LOG_D(PHY,"[UE %d] DLSCH: Setting ACK for nr_tti_rx %d (pid %d, round %d, TBS %d)\n",phy_vars_ue->Mod_id,nr_tti_rx,harq_pid,harq_process->round,harq_process->TBS);
    }
    //LOG_D(PHY,"[UE %d] DLSCH: Setting ACK for subframe %d (pid %d, round %d)\n",phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->round);

  }

  // Reassembly of Transport block here
  offset = 0;

  /*
  printf("harq_pid %d\n",harq_pid);
  printf("F %d, Fbytes %d\n",harq_process->F,harq_process->F>>3);
  printf("C %d\n",harq_process->C);
  */
  uint32_t wait = 0;
  if (harq_process->C==2){
	  while((proc->decoder_thread_available == 0) )
  {
          usleep(1);
          wait++;
  }
  }
  else if ((harq_process->C==3) ){
	  while((proc->decoder_thread_available == 0) || (proc->decoder_thread_available1 == 0))
			  {
			            usleep(1);
			            wait++;
			    }
  }

  proc->decoder_main_available = 0;
  
  for (r=0; r<harq_process->C; r++) {
    if (r<harq_process->Cminus)
      Kr = harq_process->Kminus;
    else
      Kr = harq_process->Kplus;

    Kr_bytes = Kr>>3;

    //    printf("Segment %d : Kr= %d bytes\n",r,Kr_bytes);
    if (r==0) {
      memcpy(harq_process->b,
             &harq_process->c[0][(harq_process->F>>3)],
             Kr_bytes - (harq_process->F>>3)- ((harq_process->C>1)?3:0));
      offset = Kr_bytes - (harq_process->F>>3) - ((harq_process->C>1)?3:0);
      //            printf("copied %d bytes to b sequence (harq_pid %d)\n",
      //          Kr_bytes - (harq_process->F>>3),harq_pid);
      //          printf("b[0] = %x,c[%d] = %x\n",
      //      harq_process->b[0],
      //      harq_process->F>>3,
      //      harq_process->c[0][(harq_process->F>>3)]);
    } else {
      memcpy(harq_process->b+offset,
             harq_process->c[r],
             Kr_bytes- ((harq_process->C>1)?3:0));
      offset += (Kr_bytes - ((harq_process->C>1)?3:0));
    }
  }

  dlsch->last_iteration_cnt = ret;
  //proc->decoder_thread_available = 0;
  //proc->decoder_main_available = 0;
  
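  // NOTE: the explicit wait on the worker below is disabled; for C==2 or C==3 the worker flags (decoder_thread_available*) are polled before the reassembly loop above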
  //wait_on_busy_condition(&proc->mutex_td,&proc->cond_td,&proc->instance_cnt_dlsch td,"dlsch td thread");

  //return( (ret>proc->tdp.ret) ? ret : proc->tdp.ret );

  return(ret);
}
#endif

#ifdef UE_DLSCH_PARALLELISATION
/* Priority base for the DLSCH decoder worker thread: dlsch_decoding_2thread0()
 * passes FIFO_PRIORITY-1 as the third argument of init_thread().
 * NOTE(review): presumably a SCHED_FIFO priority level - confirm against init_thread(). */
#define FIFO_PRIORITY   39
uint32_t  dlsch_decoding_2thread0(void *arg)
{
	static __thread int UE_dlsch_td_retval;
    struct rx_tx_thread_data *rtd = arg;
    UE_rxtx_proc_t *proc = rtd->proc;
    PHY_VARS_NR_UE    *phy_vars_ue   = rtd->UE;

    int llr8_flag1;
    int32_t no_iteration_ldpc;
    t_nrLDPC_dec_params decParams;
    t_nrLDPC_dec_params* p_decParams = &decParams;
    t_nrLDPC_time_stats procTime;
    t_nrLDPC_time_stats* p_procTime =&procTime ;
    int16_t z [68*384];
    int8_t l [68*384];
    //__m128i l;
    int16_t inv_d [68*384];
    int16_t *p_invd =&inv_d;

    uint32_t i,j;
    uint32_t k;

    __m128i *pv = (__m128i*)&z;
    __m128i *pl = (__m128i*)&l;

    proc->instance_cnt_dlsch_td=-1;
    proc->nr_tti_rx=proc->sub_frame_start;

    proc->decoder_thread_available = 0;
    
    char threadname[256];
    sprintf(threadname,"UE_thread_dlsch_td_%d", proc->sub_frame_start);

	cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    if ( (proc->sub_frame_start+1)%RX_NB_TH == 0 && threads.dlsch_td_one != -1 )
    	CPU_SET(threads.dlsch_td_one, &cpuset);
    if ( (proc->sub_frame_start+1)%RX_NB_TH == 1 && threads.dlsch_td_two != -1 )
    	CPU_SET(threads.dlsch_td_two, &cpuset);
    if ( (proc->sub_frame_start+1)%RX_NB_TH == 2 && threads.dlsch_td_three != -1 )
    	CPU_SET(threads.dlsch_td_three, &cpuset);



#if UE_TIMING_TRACE
  time_stats_t *dlsch_rate_unmatching_stats=&phy_vars_ue->dlsch_rate_unmatching_stats;
  time_stats_t *dlsch_turbo_decoding_stats=&phy_vars_ue->dlsch_turbo_decoding_stats;
  time_stats_t *dlsch_deinterleaving_stats=&phy_vars_ue->dlsch_deinterleaving_stats;
#endif
  uint32_t A,E;
  uint32_t G;
  uint32_t ret,offset;
  uint16_t iind;
  //  uint8_t dummy_channel_output[(3*8*block_length)+12];
  short dummy_w[MAX_NUM_DLSCH_SEGMENTS][3*(8448+64)];
  uint32_t r,r_offset=0,Kr,Kr_bytes,err_flag=0,Kr_int;
  uint8_t crc_type;
  uint8_t C;
   uint8_t Qm;
   uint8_t Nl;
   uint32_t Er, Gp,GpmodC;
#ifdef DEBUG_DLSCH_DECODING
  uint16_t i;
#endif
  //#ifdef __AVX2__

  uint8_t (*tc)(int16_t *y,
                uint8_t *,
                uint16_t,
                uint16_t,
                uint16_t,
                uint8_t,
                uint8_t,
                uint8_t,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *);
  if (llr8_flag1 == 0) {
    //#ifdef __AVX2__
#if 0
    tc_2cw = phy_threegpplte_turbo_decoder16avx2;
#endif
    tc = phy_threegpplte_turbo_decoder16;
  }
  else
  {
	  //AssertFatal (harq_process->TBS >= 256 , "Mismatch flag nbRB=%d TBS=%d mcs=%d Qm=%d RIV=%d round=%d \n",
	  //		  harq_process->nb_rb, harq_process->TBS,harq_process->mcs,harq_process->Qm,harq_process->rvidx,harq_process->round);
	    tc = phy_threegpplte_turbo_decoder8;
  }

	init_thread(900000,1000000 , FIFO_PRIORITY-1, &cpuset, threadname);
	while (!oai_exit) {

		//proc->decoder_thread_available = 1;
		
	        if (pthread_mutex_lock(&proc->mutex_dlsch_td) != 0) {
	            LOG_E( PHY, "[SCHED][UE] error locking mutex for UE dlsch td\n" );
	            exit_fun("nothing to add");
	        }
	        while (proc->instance_cnt_dlsch_td < 0) {
	            // most of the time, the thread is waiting here
	            pthread_cond_wait( &proc->cond_dlsch_td, &proc->mutex_dlsch_td );
	        }
	        if (pthread_mutex_unlock(&proc->mutex_dlsch_td) != 0) {
	            LOG_E( PHY, "[SCHED][UE] error unlocking mutex for UE dlsch_td \n" );
	            exit_fun("nothing to add");
	        }

	        uint32_t wait = 0;
	        	          while(proc->decoder_main_available == 0)
	        	          {
	        	                  usleep(1);
	        	                  wait++;
	        	          }

	        	          //proc->decoder_thread_available = 0;
	        //PHY_VARS_NR_UE *phy_vars_ue   		= tdp->UE;
	        	int eNB_id         				= proc->eNB_id;
	        	int harq_pid      				= proc->harq_pid;
	        	llr8_flag1     					= proc->llr8_flag;
	        	//r_offset						= proc->Er;
	        	//UE_rxtx_proc_t *proc    		= tdp->proc;
	        	int frame                       = proc->frame_rx;
	        	int subframe      				= proc->nr_tti_rx;
	        	NR_UE_DLSCH_t *dlsch 			= phy_vars_ue->dlsch[phy_vars_ue->current_thread_id[subframe]][eNB_id][0];
	        	NR_DL_UE_HARQ_t *harq_process  = dlsch->harq_processes[harq_pid];
	        	short *dlsch_llr 				= phy_vars_ue->pdsch_vars[phy_vars_ue->current_thread_id[subframe]][eNB_id]->llr[0];
	        	//printf("2thread0 llr flag %d tdp flag %d\n",llr8_flag1, tdp->llr8_flag);
	        	
  //  nb_rb = dlsch->nb_rb;

  /*
  if (nb_rb > frame_parms->N_RB_DL) {
    printf("dlsch_decoding.c: Illegal nb_rb %d\n",nb_rb);
    return(max_ldpc_iterations);
    }*/

  /*harq_pid = dlsch->current_harq_pid[phy_vars_ue->current_thread_id[subframe]];
  if (harq_pid >= 8) {
    printf("dlsch_decoding.c: Illegal harq_pid %d\n",harq_pid);
    return(max_ldpc_iterations);
  }
  */



  harq_process->trials[harq_process->round]++;

  A = harq_process->TBS; //2072 for QPSK 1/3

  ret = dlsch->max_ldpc_iterations;


  G = harq_process->G;
  //get_G(frame_parms,nb_rb,dlsch->rb_alloc,mod_order,num_pdcch_symbols,phy_vars_ue->frame,subframe);
    
  if (harq_process->round == 0) {
    // This is a new packet, so compute quantities regarding segmentation
    harq_process->B = A+24;
 #ifdef TD_DECODING
      lte_segmentation(NULL,
                       NULL,
                       harq_process->B,
                       &harq_process->C,
                       &harq_process->Cplus,
                       &harq_process->Cminus,
                       &harq_process->Kplus,
                       &harq_process->Kminus,
                       &harq_process->F);
      //  CLEAR LLR's HERE for first packet in process
  #else
      nr_segmentation(NULL,
      	                    NULL,
      	                    harq_process->B,
      	                    &harq_process->C,
      	                    &harq_process->Kplus,
      	                    &harq_process->Kminus,
      						&harq_process->Z,
      	                    &harq_process->F);
      	p_decParams->Z = harq_process->Z;
  #endif

    }

    //	  p_decParams->Z = 128;
        p_decParams->BG = 1;
        p_decParams->R = 89;
        p_decParams->numMaxIter = 2;
        Kr = p_decParams->Z*22;
        p_decParams->outMode= 0;

  /*
  else {
    printf("dlsch_decoding.c: Ndi>0 not checked yet!!\n");
    return(max_ldpc_iterations);
  }
  */
  err_flag = 0;
  //r_offset = 0;

  /*
  unsigned char bw_scaling =1;

  switch (frame_parms->N_RB_DL) {
  case 6:
    bw_scaling =16;
    break;

  case 25:
    bw_scaling =4;
    break;

  case 50:
    bw_scaling =2;
    break;

  default:
    bw_scaling =1;
    break;
  }

  if (harq_process->C > MAX_NUM_DLSCH_SEGMENTS/bw_scaling) {
    LOG_E(PHY,"Illegal harq_process->C %d > %d\n",harq_process->C,MAX_NUM_DLSCH_SEGMENTS/bw_scaling);
    return((1+dlsch->max_ldpc_iterations));
  }*/
#ifdef DEBUG_DLSCH_DECODING
  printf("Segmentation: C %d, Cminus %d, Kminus %d, Kplus %d\n",harq_process->C,harq_process->Cminus,harq_process->Kminus,harq_process->Kplus);
#endif

  opp_enabled=1;
  
  Qm= harq_process->Qm;
  	       Nl=harq_process->Nl;
  	       //r_thread = harq_process->C/2-1;
  	       C= harq_process->C;

  	       Gp = G/Nl/Qm;
  	        GpmodC = Gp%C;



  	        if ((C/2-1) < (C-(GpmodC)))
  	        	r_offset = Nl*Qm * (Gp/C);
  	        else
  	        	r_offset = Nl*Qm * ((GpmodC==0?0:1) + (Gp/C));

		//	printf("thread0 r_offset %d\n",r_offset);
  	       
  //for (r=(harq_process->C/2); r<harq_process->C; r++) {
  	     r=1; //(harq_process->C/2);

    // Get Turbo interleaver parameters
#ifdef TD_DECODING
	  if (r<harq_process->Cminus)
      Kr = harq_process->Kminus;
    else
      Kr = harq_process->Kplus;
#else
	  Kr = harq_process->Kplus;
#endif

    Kr_bytes = Kr>>3;

     //workaround for nr ldpc using lte interleaving
  if (dlsch->harq_processes[harq_pid]->C >= 2)
      	Kr_int = G/(3*dlsch->harq_processes[harq_pid]->C);
  else
  	Kr_int = Kr;

    if (Kr_bytes<=64)
      iind = (Kr_bytes-5);
    else if (Kr_bytes <=128)
      iind = 59 + ((Kr_bytes-64)>>1);
    else if (Kr_bytes <= 256)
      iind = 91 + ((Kr_bytes-128)>>2);
    else if (Kr_bytes <= 768)
      iind = 123 + ((Kr_bytes-256)>>3);
    else {
      //printf("dlsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes);
      //return(dlsch->max_ldpc_iterations);
    }

#ifdef DEBUG_DLSCH_DECODING
    printf("f1 %d, f2 %d, F %d\n",f1f2mat_old[2*iind],f1f2mat_old[1+(2*iind)],(r==0) ? harq_process->F : 0);
#endif

#if UE_TIMING_TRACE
    start_meas(dlsch_rate_unmatching_stats);
#endif
    memset(&dummy_w[r][0],0,3*(8448+64)*sizeof(short));
    harq_process->RTC[r] = generate_dummy_w(Kr_int,
                                            (uint8_t*) &dummy_w[r][0],
                                            (r==0) ? harq_process->F : 0);

#ifdef DEBUG_DLSCH_DECODING
    LOG_D(PHY,"HARQ_PID %d Rate Matching Segment %d (coded bits %d,unpunctured/repeated bits %d, TBS %d, mod_order %d, nb_rb %d, Nl %d, rv %d, round %d)...\n",
          harq_pid,r, G,
          Kr*3,
          harq_process->TBS,
          harq_process->Qm,
          harq_process->nb_rb,
          harq_process->Nl,
          harq_process->rvidx,
          harq_process->round);
#endif

#ifdef DEBUG_DLSCH_DECODING
    printf(" in decoding dlsch->harq_processes[harq_pid]->rvidx = %d\n", dlsch->harq_processes[harq_pid]->rvidx);
#endif
    if (lte_rate_matching_turbo_rx(harq_process->RTC[r],
                                   G,
                                   harq_process->w[r],
                                   (uint8_t*)&dummy_w[r][0],
                                   dlsch_llr+r_offset,
                                   harq_process->C,
                                   dlsch->Nsoft,
                                   dlsch->Mdlharq,
                                   dlsch->Kmimo,
                                   harq_process->rvidx,
                                   (harq_process->round==0)?1:0,
                                   harq_process->Qm,
                                   harq_process->Nl,
                                   r,
                                   &E)==-1) {
#if UE_TIMING_TRACE
      stop_meas(dlsch_rate_unmatching_stats);
#endif
      LOG_E(PHY,"dlsch_decoding.c: Problem in rate_matching\n");
      //return(dlsch->max_ldpc_iterations);
    } else
    {
#if UE_TIMING_TRACE
      stop_meas(dlsch_rate_unmatching_stats);
#endif
    }
    r_offset += E;

    /*
    printf("Subblock deinterleaving, d %p w %p\n",
     harq_process->d[r],
     harq_process->w);
    */
#if UE_TIMING_TRACE
    start_meas(dlsch_deinterleaving_stats);
#endif
    sub_block_deinterleaving_turbo(4+Kr,
                                   &harq_process->d[r][96],

                                   harq_process->w[r]);
#if UE_TIMING_TRACE
    stop_meas(dlsch_deinterleaving_stats);
#endif
#ifdef DEBUG_DLSCH_DECODING
    /*
    if (r==0) {
              write_output("decoder_llr.m","decllr",dlsch_llr,G,1,0);
              write_output("decoder_in.m","dec",&harq_process->d[0][96],(3*8*Kr_bytes)+12,1,0);
    }

    printf("decoder input(segment %d) :",r);
    int i; for (i=0;i<(3*8*Kr_bytes)+12;i++)
      printf("%d : %d\n",i,harq_process->d[r][96+i]);
      printf("\n");*/
#endif


    //    printf("Clearing c, %p\n",harq_process->c[r]);
    memset(harq_process->c[r],0,Kr_bytes);

    //    printf("done\n");
    if (harq_process->C == 1)
      crc_type = CRC24_A;
    else
      crc_type = CRC24_B;

    /*
    printf("decoder input(segment %d)\n",r);
    for (i=0;i<(3*8*Kr_bytes)+12;i++)
      if ((harq_process->d[r][96+i]>7) ||
    (harq_process->d[r][96+i] < -8))
    printf("%d : %d\n",i,harq_process->d[r][96+i]);
    printf("\n");
    */

    //#ifndef __AVX2__
#if 1
    if (err_flag == 0) {
/*
        LOG_I(PHY, "turbo algo Kr=%d cb_cnt=%d C=%d nbRB=%d crc_type %d TBSInput=%d TBSHarq=%d TBSplus24=%d mcs=%d Qm=%d RIV=%d round=%d maxIter %d\n",
                            Kr,r,harq_process->C,harq_process->nb_rb,crc_type,A,harq_process->TBS,
                            harq_process->B,harq_process->mcs,harq_process->Qm,harq_process->rvidx,harq_process->round,dlsch->max_ldpc_iterations);
*/
    	if (llr8_flag1) {
    		AssertFatal (Kr >= 256, "turbo algo issue Kr=%d cb_cnt=%d C=%d nbRB=%d TBSInput=%d TBSHarq=%d TBSplus24=%d mcs=%d Qm=%d RIV=%d round=%d\n",
    				Kr,r,harq_process->C,harq_process->nb_rb,A,harq_process->TBS,harq_process->B,harq_process->mcs,harq_process->Qm,harq_process->rvidx,harq_process->round);
    	}
#if UE_TIMING_TRACE
        start_meas(dlsch_turbo_decoding_stats);
#endif
//      LOG_D(PHY,"AbsSubframe %d.%d Start turbo segment %d/%d \n",frame%1024,subframe,r,harq_process->C-1);
#ifdef TD_DECODING
        ret = tc
            (&harq_process->d[r][96],
             harq_process->c[r],
             Kr,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);

      
#else

		memset(pv,0,2*p_decParams->Z*sizeof(int16_t));
        //memset(pl,0,2*p_decParams->Z*sizeof(int8_t));

		//if (A < 1000){

      	for (i=2*p_decParams->Z/8, j = 0; i < (68*p_decParams->Z/8+1); i++, j++)
      	{
      		pv[i]= _mm_loadu_si128((__m128i*)(&inv_d[8*j]));
      	}
      	/*}
		else{
      	for (i=2*p_decParams->Z/8, j = 0; i < (68*p_decParams->Z/8+1); i++, j++)
      	      	{
      	      		pv[i]= _mm_loadu_si128((__m128i*)&harq_process->d[r][96+8*j]);
      	      	}
		}*/
		for (i=0, j=0; j < (68*p_decParams->Z/16);  i+=2, j++)
      	      	{
      				//printf("mm packs i %d j %d\n", i, j);
      				//print128_num(pv[i]);
      				//print128_num(pv[i+1]);
      	      		pl[j] = _mm_packs_epi16(pv[i],pv[i+1]);
      	      		//print128_num2bytes(pl[j]);
      	      	}

		no_iteration_ldpc = nrLDPC_decoder(p_decParams,
      			&pl[0],
				llrProcBuf,
          		p_procTime);

		if (no_iteration_ldpc > 2)
			printf("Error number of iteration LPDC %d\n", no_iteration_ldpc);
		//else
			//printf("OK number of iteration LPDC %d\n", no_iteration_ldpc);

		for (int m=0; m < Kr>>3; m ++)
		      	      	{
		      				harq_process->c[r][m]= (uint8_t) llrProcBuf[m];
		      	      	}

		      	/*for (int u=0; u < Kr>>3; u ++)
		      	      	      	{
		      						ullrProcBuf[u]= (uint8_t) llrProcBuf[u];
		      	      	      	}


		      	printf("output unsigned ullrProcBuf \n");

		      	for (int j=0; j < Kr>>3; j ++)
		      	      	       	      	{

		      	      						printf(" %d \n", ullrProcBuf[j]);

		      	      	      	      	}
		     	printf(" \n");*/
#endif
		
#if UE_TIMING_TRACE
      stop_meas(dlsch_turbo_decoding_stats);
#endif
    }
#else
    if ((harq_process->C == 1) ||
	((r==harq_process->C-1) && (skipped_last==0))) { // last segment with odd number of segments

#if UE_TIMING_TRACE
        start_meas(dlsch_turbo_decoding_stats);
#endif
      ret = tc
            (&harq_process->d[r][96],
             harq_process->c[r],
             Kr,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
 #if UE_TIMING_TRACE
      stop_meas(dlsch_turbo_decoding_stats);
#endif
      //      printf("single decode, exit\n");
      //      exit(-1);
    }
    else {
    // we can merge code segments
      if ((skipped_last == 0) && (r<harq_process->C-1)) {
	skipped_last = 1;
	Kr_last = Kr;
      }
      else {
	skipped_last=0;

	if (Kr_last == Kr) { // decode 2 code segments with AVX2 version
#ifdef DEBUG_DLSCH_DECODING
	  printf("single decoding segment %d (%p)\n",r-1,&harq_process->d[r-1][96]);
#endif
#if UE_TIMING_TRACE
	  start_meas(dlsch_turbo_decoding_stats);
#endif
#ifdef DEBUG_DLSCH_DECODING
	  printf("double decoding segments %d,%d (%p,%p)\n",r-1,r,&harq_process->d[r-1][96],&harq_process->d[r][96]);
#endif
	  ret = tc_2cw
            (&harq_process->d[r-1][96],
	     &harq_process->d[r][96],
             harq_process->c[r-1],
             harq_process->c[r],
             Kr,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
	  /*
	  ret = tc
            (&harq_process->d[r-1][96],
             harq_process->c[r-1],
             Kr_last,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);

	     exit(-1);*/
#if UE_TIMING_TRACE
      stop_meas(dlsch_turbo_decoding_stats);
#endif
	}
	else { // Kr_last != Kr
#if UE_TIMING_TRACE
	  start_meas(dlsch_turbo_decoding_stats);
#endif
	  ret = tc
            (&harq_process->d[r-1][96],
             harq_process->c[r-1],
             Kr_last,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
#if UE_TIMING_TRACE
      stop_meas(dlsch_turbo_decoding_stats);

	  start_meas(dlsch_turbo_decoding_stats);
#endif

	  ret = tc
            (&harq_process->d[r][96],
             harq_process->c[r],
             Kr,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);

#if UE_TIMING_TRACE

	  stop_meas(dlsch_turbo_decoding_stats);

	  /*printf("Segmentation: C %d r %d, dlsch_rate_unmatching_stats %5.3f dlsch_deinterleaving_stats %5.3f  dlsch_turbo_decoding_stats %5.3f \n",
              harq_process->C,
              r,
              dlsch_rate_unmatching_stats->p_time/(cpuf*1000.0),
              dlsch_deinterleaving_stats->p_time/(cpuf*1000.0),
              dlsch_turbo_decoding_stats->p_time/(cpuf*1000.0));*/
#endif
	}
      }
    }
#endif


    if ((err_flag == 0) && (ret>=(1+dlsch->max_ldpc_iterations))) {// a Code segment is in error so break;
//      LOG_D(PHY,"AbsSubframe %d.%d CRC failed, segment %d/%d \n",frame%1024,subframe,r,harq_process->C-1);
      err_flag = 1;
    }
  //}

  /*int32_t frame_rx_prev = frame;
  int32_t subframe_rx_prev = subframe - 1;
  if (subframe_rx_prev < 0) {
    frame_rx_prev--;
    subframe_rx_prev += 10;
  }
  frame_rx_prev = frame_rx_prev%1024;*/
#if 0
  if (err_flag == 1) {
//#if UE_DEBUG_TRACE
    LOG_I(PHY,"[UE %d] THREAD 0 DLSCH: Setting NAK for SFN/SF %d/%d (pid %d, status %d, round %d, TBS %d, mcs %d) Kr %d r %d harq_process->round %d\n",
        phy_vars_ue->Mod_id, frame, subframe, harq_pid,harq_process->status, harq_process->round,harq_process->TBS,harq_process->mcs,Kr,r,harq_process->round);
//#endif
    dlsch->harq_ack[subframe].ack = 0;
    dlsch->harq_ack[subframe].harq_id = harq_pid;
    dlsch->harq_ack[subframe].send_harq_status = 1;
    harq_process->errors[harq_process->round]++;
    harq_process->round++;


    //    printf("Rate: [UE %d] DLSCH: Setting NACK for subframe %d (pid %d, round %d)\n",phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->round);
    if (harq_process->round >= dlsch->Mdlharq) {
      harq_process->status = SCH_IDLE;
      harq_process->round  = 0;
    }
/*    if(is_crnti)
    {
    LOG_D(PHY,"[UE %d] DLSCH: Setting NACK for subframe %d (pid %d, pid status %d, round %d/Max %d, TBS %d)\n",
               phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->status,harq_process->round,dlsch->Mdlharq,harq_process->TBS);
    }*/


    //return((1+dlsch->max_ldpc_iterations));
  } else {
#if UE_DEBUG_TRACE
      LOG_I(PHY,"[UE %d] THREAD 0 DLSCH: Setting ACK for subframe %d TBS %d mcs %d nb_rb %d\n",
           phy_vars_ue->Mod_id,subframe,harq_process->TBS,harq_process->mcs,harq_process->nb_rb);
#endif

    harq_process->status = SCH_IDLE;
    harq_process->round  = 0;
    dlsch->harq_ack[subframe].ack = 1;
    dlsch->harq_ack[subframe].harq_id = harq_pid;
    dlsch->harq_ack[subframe].send_harq_status = 1;
    //LOG_I(PHY,"[UE %d] DLSCH: Setting ACK for SFN/SF %d/%d (pid %d, status %d, round %d, TBS %d, mcs %d)\n",
      //  phy_vars_ue->Mod_id, frame, subframe, harq_pid, harq_process->status, harq_process->round,harq_process->TBS,harq_process->mcs);

/*    if(is_crnti)
    {
    LOG_D(PHY,"[UE %d] DLSCH: Setting ACK for subframe %d (pid %d, round %d, TBS %d)\n",phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->round,harq_process->TBS);
    }
    LOG_D(PHY,"[UE %d] DLSCH: Setting ACK for subframe %d (pid %d, round %d)\n",phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->round);

  }*/

  // Reassembly of Transport block here
  offset = 0;

  /*
  printf("harq_pid %d\n",harq_pid);
  printf("F %d, Fbytes %d\n",harq_process->F,harq_process->F>>3);
  printf("C %d\n",harq_process->C);
  */
  for (r=0; r<harq_process->C; r++) {
    if (r<harq_process->Cminus)
      Kr = harq_process->Kminus;
    else
      Kr = harq_process->Kplus;

    Kr_bytes = Kr>>3;

    //    printf("Segment %d : Kr= %d bytes\n",r,Kr_bytes);
    if (r==0) {
      memcpy(harq_process->b,
             &harq_process->c[0][(harq_process->F>>3)],
             Kr_bytes - (harq_process->F>>3)- ((harq_process->C>1)?3:0));
      offset = Kr_bytes - (harq_process->F>>3) - ((harq_process->C>1)?3:0);
      //            printf("copied %d bytes to b sequence (harq_pid %d)\n",
      //          Kr_bytes - (harq_process->F>>3),harq_pid);
      //          printf("b[0] = %x,c[%d] = %x\n",
      //      harq_process->b[0],
      //      harq_process->F>>3,
      //      harq_process->c[0][(harq_process->F>>3)]);
    } else {
      memcpy(harq_process->b+offset,
             harq_process->c[r],
             Kr_bytes- ((harq_process->C>1)?3:0));
      offset += (Kr_bytes - ((harq_process->C>1)?3:0));
    }
  }

  dlsch->last_iteration_cnt = ret;

  //return(ret);
  }
#endif

  proc->decoder_thread_available = 1;
  //proc->decoder_main_available = 0;
  
  if (pthread_mutex_lock(&proc->mutex_dlsch_td) != 0) {
              LOG_E( PHY, "[SCHED][UE] error locking mutex for UE RXTX\n" );
              exit_fun("noting to add");
          }
          proc->instance_cnt_dlsch_td--;
          if (pthread_mutex_unlock(&proc->mutex_dlsch_td) != 0) {
              LOG_E( PHY, "[SCHED][UE] error unlocking mutex for UE td1\n" );
              exit_fun("noting to add");
          }
      }

      // thread finished
          free(arg);
          return &UE_dlsch_td_retval;
}
#endif

#ifdef UE_DLSCH_PARALLELISATION
/* Base priority constant for the DLSCH decoding thread defined below:
 * dlsch_decoding_2thread1() passes FIFO_PRIORITY-1 as the third argument
 * of init_thread().
 * NOTE(review): presumably a real-time (SCHED_FIFO) scheduling priority --
 * confirm against the definition of init_thread(). */
#define FIFO_PRIORITY   39
uint32_t  dlsch_decoding_2thread1(void *arg)
{
	static __thread int UE_dlsch_td_retval1;
    struct rx_tx_thread_data *rtd = arg;
    UE_rxtx_proc_t *proc = rtd->proc;
    PHY_VARS_NR_UE    *phy_vars_ue   = rtd->UE;

    int llr8_flag1;
    int32_t no_iteration_ldpc;
    t_nrLDPC_dec_params decParams;
    t_nrLDPC_dec_params* p_decParams = &decParams;
    t_nrLDPC_time_stats procTime;
    t_nrLDPC_time_stats* p_procTime =&procTime ;
    int16_t z [68*384];
    int8_t l [68*384];
    //__m128i l;
    int16_t inv_d [68*384];
    int16_t *p_invd =&inv_d;

    uint32_t i,j;
    uint32_t k;

    __m128i *pv = (__m128i*)&z;
    __m128i *pl = (__m128i*)&l;

    proc->instance_cnt_dlsch_td1=-1;
    proc->nr_tti_rx=proc->sub_frame_start;

    printf("start thread 1\n");
    proc->decoder_thread_available1 = 0;

    char threadname[256];
    sprintf(threadname,"UE_thread_dlsch_td1_%d", proc->sub_frame_start);

	cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    if ( (proc->sub_frame_start+1)%RX_NB_TH == 0 && threads.dlsch_td_one != -1 )
    	CPU_SET(threads.dlsch_td_one, &cpuset);
    if ( (proc->sub_frame_start+1)%RX_NB_TH == 1 && threads.dlsch_td_two != -1 )
    	CPU_SET(threads.dlsch_td_two, &cpuset);
    if ( (proc->sub_frame_start+1)%RX_NB_TH == 2 && threads.dlsch_td_three != -1 )
    	CPU_SET(threads.dlsch_td_three, &cpuset);



#if UE_TIMING_TRACE
  time_stats_t *dlsch_rate_unmatching_stats=&phy_vars_ue->dlsch_rate_unmatching_stats;
  time_stats_t *dlsch_turbo_decoding_stats=&phy_vars_ue->dlsch_turbo_decoding_stats;
  time_stats_t *dlsch_deinterleaving_stats=&phy_vars_ue->dlsch_deinterleaving_stats;
#endif
  uint32_t A,E;
  uint32_t G;
  uint32_t ret,offset;
  uint16_t iind;
  //  uint8_t dummy_channel_output[(3*8*block_length)+12];
  short dummy_w[MAX_NUM_DLSCH_SEGMENTS][3*(8448+64)];
  uint32_t r,r_offset=0,Kr,Kr_bytes,err_flag=0,Kr_int;
  uint8_t crc_type;
  uint8_t C;
   uint8_t Qm;
   uint8_t Nl;
   uint32_t Er, Gp,GpmodC;
#ifdef DEBUG_DLSCH_DECODING
  uint16_t i;
#endif
  //#ifdef __AVX2__

  uint8_t (*tc)(int16_t *y,
                uint8_t *,
                uint16_t,
                uint16_t,
                uint16_t,
                uint8_t,
                uint8_t,
                uint8_t,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *,
                time_stats_t *);
  if (llr8_flag1 == 0) {
    //#ifdef __AVX2__
#if 0
    tc_2cw = phy_threegpplte_turbo_decoder16avx2;
#endif
    tc = phy_threegpplte_turbo_decoder16;
  }
  else
  {
	  //AssertFatal (harq_process->TBS >= 256 , "Mismatch flag nbRB=%d TBS=%d mcs=%d Qm=%d RIV=%d round=%d \n",
	  //		  harq_process->nb_rb, harq_process->TBS,harq_process->mcs,harq_process->Qm,harq_process->rvidx,harq_process->round);
	    tc = phy_threegpplte_turbo_decoder8;
  }

	init_thread(900000,1000000 , FIFO_PRIORITY-1, &cpuset, threadname);
	printf("2thread1 oai_exit %d\n", oai_exit);
	while (!oai_exit) {
	        if (pthread_mutex_lock(&proc->mutex_dlsch_td1) != 0) {
	            LOG_E( PHY, "[SCHED][UE] error locking mutex for UE dlsch td\n" );
	            exit_fun("nothing to add");
	        }
	        while (proc->instance_cnt_dlsch_td1 < 0) {
	            // most of the time, the thread is waiting here
	            pthread_cond_wait( &proc->cond_dlsch_td1, &proc->mutex_dlsch_td1 );
	        }
	        if (pthread_mutex_unlock(&proc->mutex_dlsch_td1) != 0) {
	            LOG_E( PHY, "[SCHED][UE] error unlocking mutex for UE dlsch_td \n" );
	            exit_fun("nothing to add");
	        }

	        //printf("2thread1 main available %d\n", proc->decoder_main_available);

	        uint32_t wait = 0;
	        	          while(proc->decoder_main_available == 0)
	        	          {
	        	                  usleep(1);
	        	                  wait++;
	        	          }
	        	
	        	          //proc->decoder_thread_available1 = 0;

	        //PHY_VARS_NR_UE *phy_vars_ue   		= tdp->UE;
	        	int eNB_id         				= proc->eNB_id;
	        	int harq_pid      				= proc->harq_pid;
	        	llr8_flag1     					= proc->llr8_flag;
	        	//r_offset						= proc->Er;
	        	//UE_rxtx_proc_t *proc    		= tdp->proc;
	        	int frame                       = proc->frame_rx;
	        	int subframe      				= proc->nr_tti_rx;
	        	NR_UE_DLSCH_t *dlsch 			= phy_vars_ue->dlsch[phy_vars_ue->current_thread_id[subframe]][eNB_id][0];
	        	NR_DL_UE_HARQ_t *harq_process  = dlsch->harq_processes[harq_pid];
	        	short *dlsch_llr 				= phy_vars_ue->pdsch_vars[phy_vars_ue->current_thread_id[subframe]][eNB_id]->llr[0];
	        	//printf("2thread0 llr flag %d tdp flag %d\n",llr8_flag1, tdp->llr8_flag);
	        	//printf("2thread1 nr_tti_tx %d subframe %d SF thread id %d r_offset %d\n", proc->nr_tti_rx, subframe, phy_vars_ue->current_thread_id[subframe], r_offset);

  //  nb_rb = dlsch->nb_rb;

  /*
  if (nb_rb > frame_parms->N_RB_DL) {
    printf("dlsch_decoding.c: Illegal nb_rb %d\n",nb_rb);
    return(max_ldpc_iterations);
    }*/

  /*harq_pid = dlsch->current_harq_pid[phy_vars_ue->current_thread_id[subframe]];
  if (harq_pid >= 8) {
    printf("dlsch_decoding.c: Illegal harq_pid %d\n",harq_pid);
    return(max_ldpc_iterations);
  }
  */



  harq_process->trials[harq_process->round]++;

  A = harq_process->TBS; //2072 for QPSK 1/3

  ret = dlsch->max_ldpc_iterations;


  G = harq_process->G;
  //get_G(frame_parms,nb_rb,dlsch->rb_alloc,mod_order,num_pdcch_symbols,phy_vars_ue->frame,subframe);

    //printf("DLSCH Decoding,  A %d harq_pid %d G %d\n",A, harq_pid,harq_process->G);

  if (harq_process->round == 0) {
    // This is a new packet, so compute quantities regarding segmentation
    harq_process->B = A+24;
#ifdef TD_DECODING
      lte_segmentation(NULL,
                       NULL,
                       harq_process->B,
                       &harq_process->C,
                       &harq_process->Cplus,
                       &harq_process->Cminus,
                       &harq_process->Kplus,
                       &harq_process->Kminus,
                       &harq_process->F);
      //  CLEAR LLR's HERE for first packet in process
  #else
      nr_segmentation(NULL,
      	                    NULL,
      	                    harq_process->B,
      	                    &harq_process->C,
      	                    &harq_process->Kplus,
      	                    &harq_process->Kminus,
      						&harq_process->Z,
      	                    &harq_process->F);
      	p_decParams->Z = harq_process->Z;      	

  #endif

    }

    //	  p_decParams->Z = 128;
        p_decParams->BG = 1;
        p_decParams->R = 89;
        p_decParams->numMaxIter = 2;
        Kr = p_decParams->Z*22;
        p_decParams->outMode= 0;
  /*
  else {
    printf("dlsch_decoding.c: Ndi>0 not checked yet!!\n");
    return(max_ldpc_iterations);
  }
  */
  err_flag = 0;
  //r_offset = 0;

  /*
  unsigned char bw_scaling =1;

  switch (frame_parms->N_RB_DL) {
  case 6:
    bw_scaling =16;
    break;

  case 25:
    bw_scaling =4;
    break;

  case 50:
    bw_scaling =2;
    break;

  default:
    bw_scaling =1;
    break;
  }

  if (harq_process->C > MAX_NUM_DLSCH_SEGMENTS/bw_scaling) {
    LOG_E(PHY,"Illegal harq_process->C %d > %d\n",harq_process->C,MAX_NUM_DLSCH_SEGMENTS/bw_scaling);
    return((1+dlsch->max_ldpc_iterations));
  }*/
#ifdef DEBUG_DLSCH_DECODING
  printf("Segmentation: C %d, Cminus %d, Kminus %d, Kplus %d\n",harq_process->C,harq_process->Cminus,harq_process->Kminus,harq_process->Kplus);
#endif

  opp_enabled=1;
  
  Qm= harq_process->Qm;
  	       Nl=harq_process->Nl;
  	       //r_thread = harq_process->C/2-1;
  	       C= harq_process->C;

  	       Gp = G/Nl/Qm;
  	        GpmodC = Gp%C;



  	        if ((C/2-1) < (C-(GpmodC)))
  	        	r_offset = Nl*Qm * (Gp/C);
  	        else
  	        	r_offset = Nl*Qm * ((GpmodC==0?0:1) + (Gp/C));

  	       //printf("sub thread r_offset %d\n", r_offset);

  //for (r=(harq_process->C/2); r<harq_process->C; r++) {
  	     r=2; //(harq_process->C/2);
  	   r_offset = r*r_offset;
	 //printf("thread1 r=%d r_offset %d \n",r, r_offset);


    // Get Turbo interleaver parameters
#ifdef TD_DECODING
	  if (r<harq_process->Cminus)
      Kr = harq_process->Kminus;
    else
      Kr = harq_process->Kplus;
#else
	  Kr = harq_process->Kplus;
#endif

    Kr_bytes = Kr>>3;

     //workaround for nr ldpc using lte interleaving
  if (dlsch->harq_processes[harq_pid]->C >= 2)
      	Kr_int = G/(3*dlsch->harq_processes[harq_pid]->C);
  else
  	Kr_int = Kr;

    if (Kr_bytes<=64)
      iind = (Kr_bytes-5);
    else if (Kr_bytes <=128)
      iind = 59 + ((Kr_bytes-64)>>1);
    else if (Kr_bytes <= 256)
      iind = 91 + ((Kr_bytes-128)>>2);
    else if (Kr_bytes <= 768)
      iind = 123 + ((Kr_bytes-256)>>3);
    else {
      //printf("dlsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes);
      //return(dlsch->max_ldpc_iterations);
    }

#ifdef DEBUG_DLSCH_DECODING
    printf("f1 %d, f2 %d, F %d\n",f1f2mat_old[2*iind],f1f2mat_old[1+(2*iind)],(r==0) ? harq_process->F : 0);
#endif

#if UE_TIMING_TRACE
    start_meas(dlsch_rate_unmatching_stats);
#endif
    memset(&dummy_w[r][0],0,3*(8448+64)*sizeof(short));
    harq_process->RTC[r] = generate_dummy_w(Kr_int,
                                            (uint8_t*) &dummy_w[r][0],
                                            (r==0) ? harq_process->F : 0);

#ifdef DEBUG_DLSCH_DECODING
    LOG_D(PHY,"HARQ_PID %d Rate Matching Segment %d (coded bits %d,unpunctured/repeated bits %d, TBS %d, mod_order %d, nb_rb %d, Nl %d, rv %d, round %d)...\n",
          harq_pid,r, G,
          Kr*3,
          harq_process->TBS,
          harq_process->Qm,
          harq_process->nb_rb,
          harq_process->Nl,
          harq_process->rvidx,
          harq_process->round);
#endif

#ifdef DEBUG_DLSCH_DECODING
    printf(" in decoding dlsch->harq_processes[harq_pid]->rvidx = %d\n", dlsch->harq_processes[harq_pid]->rvidx);
#endif
    if (lte_rate_matching_turbo_rx(harq_process->RTC[r],
                                   G,
                                   harq_process->w[r],
                                   (uint8_t*)&dummy_w[r][0],
                                   dlsch_llr+r_offset,
                                   harq_process->C,
                                   dlsch->Nsoft,
                                   dlsch->Mdlharq,
                                   dlsch->Kmimo,
                                   harq_process->rvidx,
                                   (harq_process->round==0)?1:0,
                                   harq_process->Qm,
                                   harq_process->Nl,
                                   r,
                                   &E)==-1) {
#if UE_TIMING_TRACE
      stop_meas(dlsch_rate_unmatching_stats);
#endif
      LOG_E(PHY,"dlsch_decoding.c: Problem in rate_matching\n");
      //return(dlsch->max_ldpc_iterations);
    } else
    {
#if UE_TIMING_TRACE
      stop_meas(dlsch_rate_unmatching_stats);
#endif
    }
    r_offset += E;

    /*
    printf("Subblock deinterleaving, d %p w %p\n",
     harq_process->d[r],
     harq_process->w);
    */
#if UE_TIMING_TRACE
    start_meas(dlsch_deinterleaving_stats);
#endif
    sub_block_deinterleaving_turbo(4+Kr,
                                   &harq_process->d[r][96],

                                   harq_process->w[r]);
#if UE_TIMING_TRACE
    stop_meas(dlsch_deinterleaving_stats);
#endif
#ifdef DEBUG_DLSCH_DECODING
    /*
    if (r==0) {
              write_output("decoder_llr.m","decllr",dlsch_llr,G,1,0);
              write_output("decoder_in.m","dec",&harq_process->d[0][96],(3*8*Kr_bytes)+12,1,0);
    }

    printf("decoder input(segment %d) :",r);
    int i; for (i=0;i<(3*8*Kr_bytes)+12;i++)
      printf("%d : %d\n",i,harq_process->d[r][96+i]);
      printf("\n");*/
#endif


    //    printf("Clearing c, %p\n",harq_process->c[r]);
    memset(harq_process->c[r],0,Kr_bytes);

    //    printf("done\n");
    if (harq_process->C == 1)
      crc_type = CRC24_A;
    else
      crc_type = CRC24_B;

    /*
    printf("decoder input(segment %d)\n",r);
    for (i=0;i<(3*8*Kr_bytes)+12;i++)
      if ((harq_process->d[r][96+i]>7) ||
    (harq_process->d[r][96+i] < -8))
    printf("%d : %d\n",i,harq_process->d[r][96+i]);
    printf("\n");
    */

    //#ifndef __AVX2__
#if 1
    if (err_flag == 0) {
/*
        LOG_I(PHY, "turbo algo Kr=%d cb_cnt=%d C=%d nbRB=%d crc_type %d TBSInput=%d TBSHarq=%d TBSplus24=%d mcs=%d Qm=%d RIV=%d round=%d maxIter %d\n",
                            Kr,r,harq_process->C,harq_process->nb_rb,crc_type,A,harq_process->TBS,
                            harq_process->B,harq_process->mcs,harq_process->Qm,harq_process->rvidx,harq_process->round,dlsch->max_ldpc_iterations);
*/
    	if (llr8_flag1) {
    		AssertFatal (Kr >= 256, "turbo algo issue Kr=%d cb_cnt=%d C=%d nbRB=%d TBSInput=%d TBSHarq=%d TBSplus24=%d mcs=%d Qm=%d RIV=%d round=%d\n",
    				Kr,r,harq_process->C,harq_process->nb_rb,A,harq_process->TBS,harq_process->B,harq_process->mcs,harq_process->Qm,harq_process->rvidx,harq_process->round);
    	}
#if UE_TIMING_TRACE
        start_meas(dlsch_turbo_decoding_stats);
#endif
//      LOG_D(PHY,"AbsSubframe %d.%d Start turbo segment %d/%d \n",frame%1024,subframe,r,harq_process->C-1);
#ifdef TD_DECODING
        ret = tc
            (&harq_process->d[r][96],
             harq_process->c[r],
             Kr,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);

#else

		memset(pv,0,2*p_decParams->Z*sizeof(int16_t));
        //memset(pl,0,2*p_decParams->Z*sizeof(int8_t));

	
      	for (i=2*p_decParams->Z/8, j = 0; i < (68*p_decParams->Z/8+1); i++, j++)
      	{
      		pv[i]= _mm_loadu_si128((__m128i*)(&inv_d[8*j]));
      	}
      	
		for (i=0, j=0; j < (68*p_decParams->Z/16);  i+=2, j++)
      	      	{
      				//printf("mm packs i %d j %d\n", i, j);
      				//print128_num(pv[i]);
      				//print128_num(pv[i+1]);
      	      		pl[j] = _mm_packs_epi16(pv[i],pv[i+1]);
      	      		//print128_num2bytes(pl[j]);
      	      	}

		no_iteration_ldpc = nrLDPC_decoder(p_decParams,
      			&pl[0],
				llrProcBuf,
          		p_procTime);

		if (no_iteration_ldpc > 2)
			printf("Error number of iteration LPDC %d\n", no_iteration_ldpc);
		//else
		//	printf("OK number of iteration LPDC %d\n", no_iteration_ldpc);

		for (int m=0; m < Kr>>3; m ++)
		      	      	{
		      				harq_process->c[r][m]= (uint8_t) llrProcBuf[m];
		      	      	}

		      	/*for (int u=0; u < Kr>>3; u ++)
		      	      	      	{
		      						ullrProcBuf[u]= (uint8_t) llrProcBuf[u];
		      	      	      	}


		      	printf("output unsigned ullrProcBuf \n");

		      	for (int j=0; j < Kr>>3; j ++)
		      	      	       	      	{

		      	      						printf(" %d \n", ullrProcBuf[j]);

		      	      	      	      	}
		     	printf(" \n");*/
#endif
		//printf("output channel decoder %d %d %d %d %d \n", harq_process->c[r][0], harq_process->c[r][1], harq_process->c[r][2],harq_process->c[r][3], harq_process->c[r][4]);

		 //printf("output decoder %d %d %d %d %d \n", harq_process->c[r][0], harq_process->c[r][1], harq_process->c[r][2],harq_process->c[r][3], harq_process->c[r][4]);

#if UE_TIMING_TRACE
      stop_meas(dlsch_turbo_decoding_stats);
#endif
    }
#else
    if ((harq_process->C == 1) ||
	((r==harq_process->C-1) && (skipped_last==0))) { // last segment with odd number of segments

#if UE_TIMING_TRACE
        start_meas(dlsch_turbo_decoding_stats);
#endif
      ret = tc
            (&harq_process->d[r][96],
             harq_process->c[r],
             Kr,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
 #if UE_TIMING_TRACE
      stop_meas(dlsch_turbo_decoding_stats);
#endif
      //      printf("single decode, exit\n");
      //      exit(-1);
    }
    else {
    // we can merge code segments
      if ((skipped_last == 0) && (r<harq_process->C-1)) {
	skipped_last = 1;
	Kr_last = Kr;
      }
      else {
	skipped_last=0;

	if (Kr_last == Kr) { // decode 2 code segments with AVX2 version
#ifdef DEBUG_DLSCH_DECODING
	  printf("single decoding segment %d (%p)\n",r-1,&harq_process->d[r-1][96]);
#endif
#if UE_TIMING_TRACE
	  start_meas(dlsch_turbo_decoding_stats);
#endif
#ifdef DEBUG_DLSCH_DECODING
	  printf("double decoding segments %d,%d (%p,%p)\n",r-1,r,&harq_process->d[r-1][96],&harq_process->d[r][96]);
#endif
	  ret = tc_2cw
            (&harq_process->d[r-1][96],
	     &harq_process->d[r][96],
             harq_process->c[r-1],
             harq_process->c[r],
             Kr,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
	  /*
	  ret = tc
            (&harq_process->d[r-1][96],
             harq_process->c[r-1],
             Kr_last,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);

	     exit(-1);*/
#if UE_TIMING_TRACE
      stop_meas(dlsch_turbo_decoding_stats);
#endif
	}
	else { // Kr_last != Kr
#if UE_TIMING_TRACE
	  start_meas(dlsch_turbo_decoding_stats);
#endif
	  ret = tc
            (&harq_process->d[r-1][96],
             harq_process->c[r-1],
             Kr_last,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);
#if UE_TIMING_TRACE
      stop_meas(dlsch_turbo_decoding_stats);

	  start_meas(dlsch_turbo_decoding_stats);
#endif

	  ret = tc
            (&harq_process->d[r][96],
             harq_process->c[r],
             Kr,
             f1f2mat_old[iind*2],
             f1f2mat_old[(iind*2)+1],
             dlsch->max_ldpc_iterations,
             crc_type,
             (r==0) ? harq_process->F : 0,
             &phy_vars_ue->dlsch_tc_init_stats,
             &phy_vars_ue->dlsch_tc_alpha_stats,
             &phy_vars_ue->dlsch_tc_beta_stats,
             &phy_vars_ue->dlsch_tc_gamma_stats,
             &phy_vars_ue->dlsch_tc_ext_stats,
             &phy_vars_ue->dlsch_tc_intl1_stats,
             &phy_vars_ue->dlsch_tc_intl2_stats); //(is_crnti==0)?harq_pid:harq_pid+1);

#if UE_TIMING_TRACE

	  stop_meas(dlsch_turbo_decoding_stats);

	  /*printf("Segmentation: C %d r %d, dlsch_rate_unmatching_stats %5.3f dlsch_deinterleaving_stats %5.3f  dlsch_turbo_decoding_stats %5.3f \n",
              harq_process->C,
              r,
              dlsch_rate_unmatching_stats->p_time/(cpuf*1000.0),
              dlsch_deinterleaving_stats->p_time/(cpuf*1000.0),
              dlsch_turbo_decoding_stats->p_time/(cpuf*1000.0));*/
#endif
	}
      }
    }
#endif


    if ((err_flag == 0) && (ret>=(1+dlsch->max_ldpc_iterations))) {// a Code segment is in error so break;
//      LOG_D(PHY,"AbsSubframe %d.%d CRC failed, segment %d/%d \n",frame%1024,subframe,r,harq_process->C-1);
      err_flag = 1;
    }
  //}

  /*int32_t frame_rx_prev = frame;
  int32_t subframe_rx_prev = subframe - 1;
  if (subframe_rx_prev < 0) {
    frame_rx_prev--;
    subframe_rx_prev += 10;
  }
  frame_rx_prev = frame_rx_prev%1024;*/
#if 0
  if (err_flag == 1) {
//#if UE_DEBUG_TRACE
    LOG_I(PHY,"[UE %d] THREAD 0 DLSCH: Setting NAK for SFN/SF %d/%d (pid %d, status %d, round %d, TBS %d, mcs %d) Kr %d r %d harq_process->round %d\n",
        phy_vars_ue->Mod_id, frame, subframe, harq_pid,harq_process->status, harq_process->round,harq_process->TBS,harq_process->mcs,Kr,r,harq_process->round);
//#endif
    dlsch->harq_ack[subframe].ack = 0;
    dlsch->harq_ack[subframe].harq_id = harq_pid;
    dlsch->harq_ack[subframe].send_harq_status = 1;
    harq_process->errors[harq_process->round]++;
    harq_process->round++;


    //    printf("Rate: [UE %d] DLSCH: Setting NACK for subframe %d (pid %d, round %d)\n",phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->round);
    if (harq_process->round >= dlsch->Mdlharq) {
      harq_process->status = SCH_IDLE;
      harq_process->round  = 0;
    }
/*    if(is_crnti)
    {
    LOG_D(PHY,"[UE %d] DLSCH: Setting NACK for subframe %d (pid %d, pid status %d, round %d/Max %d, TBS %d)\n",
               phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->status,harq_process->round,dlsch->Mdlharq,harq_process->TBS);
    }*/


    //return((1+dlsch->max_ldpc_iterations));
  } else {
#if UE_DEBUG_TRACE
      LOG_I(PHY,"[UE %d] THREAD 0 DLSCH: Setting ACK for subframe %d TBS %d mcs %d nb_rb %d\n",
           phy_vars_ue->Mod_id,subframe,harq_process->TBS,harq_process->mcs,harq_process->nb_rb);
#endif

    harq_process->status = SCH_IDLE;
    harq_process->round  = 0;
    dlsch->harq_ack[subframe].ack = 1;
    dlsch->harq_ack[subframe].harq_id = harq_pid;
    dlsch->harq_ack[subframe].send_harq_status = 1;
    //LOG_I(PHY,"[UE %d] DLSCH: Setting ACK for SFN/SF %d/%d (pid %d, status %d, round %d, TBS %d, mcs %d)\n",
      //  phy_vars_ue->Mod_id, frame, subframe, harq_pid, harq_process->status, harq_process->round,harq_process->TBS,harq_process->mcs);

/*    if(is_crnti)
    {
    LOG_D(PHY,"[UE %d] DLSCH: Setting ACK for subframe %d (pid %d, round %d, TBS %d)\n",phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->round,harq_process->TBS);
    }
    LOG_D(PHY,"[UE %d] DLSCH: Setting ACK for subframe %d (pid %d, round %d)\n",phy_vars_ue->Mod_id,subframe,harq_pid,harq_process->round);

  }*/

  // Reassembly of Transport block here
  offset = 0;

  /*
  printf("harq_pid %d\n",harq_pid);
  printf("F %d, Fbytes %d\n",harq_process->F,harq_process->F>>3);
  printf("C %d\n",harq_process->C);
  */
  for (r=0; r<harq_process->C; r++) {
    if (r<harq_process->Cminus)
      Kr = harq_process->Kminus;
    else
      Kr = harq_process->Kplus;

    Kr_bytes = Kr>>3;

    //    printf("Segment %d : Kr= %d bytes\n",r,Kr_bytes);
    if (r==0) {
      memcpy(harq_process->b,
             &harq_process->c[0][(harq_process->F>>3)],
             Kr_bytes - (harq_process->F>>3)- ((harq_process->C>1)?3:0));
      offset = Kr_bytes - (harq_process->F>>3) - ((harq_process->C>1)?3:0);
      //            printf("copied %d bytes to b sequence (harq_pid %d)\n",
      //          Kr_bytes - (harq_process->F>>3),harq_pid);
      //          printf("b[0] = %x,c[%d] = %x\n",
      //      harq_process->b[0],
      //      harq_process->F>>3,
      //      harq_process->c[0][(harq_process->F>>3)]);
    } else {
      memcpy(harq_process->b+offset,
             harq_process->c[r],
             Kr_bytes- ((harq_process->C>1)?3:0));
      offset += (Kr_bytes - ((harq_process->C>1)?3:0));
    }
  }

  dlsch->last_iteration_cnt = ret;

  //return(ret);
  }
#endif

  proc->decoder_thread_available1 = 1;
  //proc->decoder_main_available = 0;

  //printf("2thread1 proc->instance_cnt_dlsch_td1 %d\n", proc->instance_cnt_dlsch_td1);

  if (pthread_mutex_lock(&proc->mutex_dlsch_td1) != 0) {
              LOG_E( PHY, "[SCHED][UE] error locking mutex for UE RXTX\n" );
              exit_fun("noting to add");
          }
          proc->instance_cnt_dlsch_td1--;
          if (pthread_mutex_unlock(&proc->mutex_dlsch_td1) != 0) {
              LOG_E( PHY, "[SCHED][UE] error unlocking mutex for UE td1\n" );
              exit_fun("noting to add");
          }
          //printf("end 2thread1 proc->instance_cnt_dlsch_td1 %d\n", proc->instance_cnt_dlsch_td1);
      }

	//printf("after 2thread1 after oai exit proc->instance_cnt_dlsch_td %d\n", proc->instance_cnt_dlsch_td1);
      // thread finished
          free(arg);
          return &UE_dlsch_td_retval1;
}
#endif