Debugging and testing of AVX2-optimized DFTs (radix 2/4 only).

5dbe3bef · Raymond Knopp · 47a6fa6e · 5dbe3bef · 5dbe3bef · 5dbe3bef
Commit 5dbe3bef authored Dec 13, 2015 by Raymond Knopp
9 changed files
--- a/cmake_targets/CMakeLists.txt
+++ b/cmake_targets/CMakeLists.txt
@@ -134,7 +134,7 @@ else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
    set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2")
  endif()
  if (CPUINFO MATCHES "sse4_2")
-    set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.2")
+    set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2 -msse4.2")
  endif()
  if (CPUINFO MATCHES "sse4_1")
    set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.1")

--- a/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c
+++ b/openair1/PHY/LTE_ESTIMATION/lte_dl_channel_estimation.c
@@ -203,8 +203,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
      multadd_complex_vector_real_scalar(dl_ch-(phy_vars_ue->lte_frame_parms.ofdm_symbol_size<<1),
                                         phy_vars_ue->ch_est_alpha,dl_ch-(phy_vars_ue->lte_frame_parms.ofdm_symbol_size<<1),
                                         1,phy_vars_ue->lte_frame_parms.ofdm_symbol_size);
-
-
+#ifdef DEBUG_CH
+    printf("k %d, first_carrier %d\n",k,phy_vars_ue->lte_frame_parms.first_carrier_offset);
+#endif
    if ((phy_vars_ue->lte_frame_parms.N_RB_DL==6)  ||
        (phy_vars_ue->lte_frame_parms.N_RB_DL==50) ||
        (phy_vars_ue->lte_frame_parms.N_RB_DL==100)) {
@@ -213,7 +214,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
      // Treat first 2 pilots specially (left edge)
      ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
      ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-      // printf("pilot 0 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
+#ifdef DEBUG_CH
+      printf("pilot 0 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
+#endif
      multadd_real_vector_complex_scalar(fl,
                                         ch,
                                         dl_ch,
@@ -224,7 +227,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,

      ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
      ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-      // printf("pilot 1 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
+#ifdef DEBUG_CH
+      printf("pilot 1 : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
+#endif
      multadd_real_vector_complex_scalar(f2l2,
                                         ch,
                                         dl_ch,
@@ -235,15 +240,13 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,

      for (pilot_cnt=2; pilot_cnt<((phy_vars_ue->lte_frame_parms.N_RB_DL)-1); pilot_cnt+=2) {

-        // printf("%d\n",dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]);
-
-        //  printf("pilot[%d][%d] (%d,%d)\n",p,pilot_cnt,pil[0],pil[1]);
-        //  printf("rx[%d] -> (%d,%d)\n", k, rxF[0], rxF[1]);


        ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15); //Re
        ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15); //Im
-        // printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]);
+#ifdef DEBUG_CH
+	printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
+#endif
        multadd_real_vector_complex_scalar(f,
                                           ch,
                                           dl_ch,
@@ -254,13 +257,11 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,
        rxF+=12;
        dl_ch+=8;

-        // printf("pilot[%d][%d] (%d,%d)\n",p,rb,pil[0],pil[1]);
-        // printf("rx[%d] -> (%d,%d)\n", k+6, rxF[0], rxF[1]);
-
-
        ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
        ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-        // printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]);
+#ifdef DEBUG_CH
+	printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
+#endif
        multadd_real_vector_complex_scalar(f2,
                                           ch,
                                           dl_ch,
@@ -281,15 +282,17 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,

      rxF   = (int16_t *)&rxdataF[aarx][((symbol_offset+1+k))];

+#ifdef DEBUG_CH
+      printf("second half k %d\n",k);
+#endif
      for (pilot_cnt=0; pilot_cnt<((phy_vars_ue->lte_frame_parms.N_RB_DL)-3); pilot_cnt+=2) {
-        //  printf("pilot[%d][%d] (%d,%d)\n",p,pilot_cnt,pil[0],pil[1]);
-        //  printf("rx[%d] -> (%d,%d)\n", k+6, rxF[0], rxF[1]);


        ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
        ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-
-        //   printf("**rb %d %d\n",rb,dl_ch-(int16_t *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]);
+#ifdef DEBUG_CH
+	printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
+#endif
        multadd_real_vector_complex_scalar(f,
                                           ch,
                                           dl_ch,
@@ -300,8 +303,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,

        ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
        ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-
-        //   printf("**rb %d %d\n",rb,dl_ch-(int16_T *)&dl_ch_estimates[(p<<1)+aarx][ch_offset]);
+#ifdef DEBUG_CH
+	printf("pilot %d : rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
+#endif
        multadd_real_vector_complex_scalar(f2,
                                           ch,
                                           dl_ch,
@@ -314,8 +318,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,

      ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
      ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-      //            printf("pilot 49: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
-
+#ifdef DEBUG_CH
+      printf("pilot %d: rxF -> (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
+#endif
      multadd_real_vector_complex_scalar(fr,
                                         ch,
                                         dl_ch,
@@ -326,7 +331,9 @@ int lte_dl_channel_estimation(PHY_VARS_UE *phy_vars_ue,

      ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
      ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-      //             printf("pilot 50: rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
+#ifdef DEBUG_CH
+      printf("pilot %d: rxF - > (%d,%d) ch -> (%d,%d), pil -> (%d,%d) \n",pilot_cnt+1,rxF[0],rxF[1],ch[0],ch[1],pil[0],pil[1]);
+#endif
      multadd_real_vector_complex_scalar(f2r2,
                                         ch,
                                         dl_ch,

--- a/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
@@ -116,7 +116,7 @@ LTE_UE_DLSCH_t *new_ue_dlsch(uint8_t Kmimo,uint8_t Mdlharq,uint8_t max_turbo_ite
    dlsch->max_turbo_iterations = max_turbo_iterations;

    for (i=0; i<Mdlharq; i++) {
-      //      msg("new_ue_dlsch: Harq process %d\n",i);
+      //      printf("new_ue_dlsch: Harq process %d\n",i);
      dlsch->harq_processes[i] = (LTE_DL_UE_HARQ_t *)malloc16(sizeof(LTE_DL_UE_HARQ_t));

      if (dlsch->harq_processes[i]) {
@@ -155,7 +155,7 @@ LTE_UE_DLSCH_t *new_ue_dlsch(uint8_t Kmimo,uint8_t Mdlharq,uint8_t max_turbo_ite
      return(dlsch);
  }

-  msg("new_ue_dlsch with size %zu: exit_flag = %u\n",sizeof(LTE_DL_UE_HARQ_t), exit_flag);
+  printf("new_ue_dlsch with size %zu: exit_flag = %u\n",sizeof(LTE_DL_UE_HARQ_t), exit_flag);
  free_ue_dlsch(dlsch);

  return(NULL);
@@ -204,22 +204,22 @@ uint32_t  dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
                time_stats_t *);

  if (!dlsch_llr) {
-    msg("dlsch_decoding.c: NULL dlsch_llr pointer\n");
+    printf("dlsch_decoding.c: NULL dlsch_llr pointer\n");
    return(dlsch->max_turbo_iterations);
  }

  if (!harq_process) {
-    msg("dlsch_decoding.c: NULL harq_process pointer\n");
+    printf("dlsch_decoding.c: NULL harq_process pointer\n");
    return(dlsch->max_turbo_iterations);
  }

  if (!frame_parms) {
-    msg("dlsch_decoding.c: NULL frame_parms pointer\n");
+    printf("dlsch_decoding.c: NULL frame_parms pointer\n");
    return(dlsch->max_turbo_iterations);
  }

  if (subframe>9) {
-    msg("dlsch_decoding.c: Illegal subframe index %d\n",subframe);
+    printf("dlsch_decoding.c: Illegal subframe index %d\n",subframe);
    return(dlsch->max_turbo_iterations);
  }

@@ -232,13 +232,13 @@ uint32_t  dlsch_decoding(PHY_VARS_UE *phy_vars_ue,

  /*
  if (nb_rb > frame_parms->N_RB_DL) {
-    msg("dlsch_decoding.c: Illegal nb_rb %d\n",nb_rb);
+    printf("dlsch_decoding.c: Illegal nb_rb %d\n",nb_rb);
    return(max_turbo_iterations);
    }*/

  /*harq_pid = dlsch->current_harq_pid;
  if (harq_pid >= 8) {
-    msg("dlsch_decoding.c: Illegal harq_pid %d\n",harq_pid);
+    printf("dlsch_decoding.c: Illegal harq_pid %d\n",harq_pid);
    return(max_turbo_iterations);
  }
  */
@@ -250,7 +250,7 @@ uint32_t  dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
  G = harq_process->G;
  //get_G(frame_parms,nb_rb,dlsch->rb_alloc,mod_order,num_pdcch_symbols,phy_vars_ue->frame,subframe);

-  //  msg("DLSCH Decoding, harq_pid %d Ndi %d\n",harq_pid,harq_process->Ndi);
+  //  printf("DLSCH Decoding, harq_pid %d Ndi %d\n",harq_pid,harq_process->Ndi);

  if (harq_process->round == 0) {
    // This is a new packet, so compute quantities regarding segmentation
@@ -269,7 +269,7 @@ uint32_t  dlsch_decoding(PHY_VARS_UE *phy_vars_ue,

  /*
  else {
-    msg("dlsch_decoding.c: Ndi>0 not checked yet!!\n");
+    printf("dlsch_decoding.c: Ndi>0 not checked yet!!\n");
    return(max_turbo_iterations);
  }
  */
@@ -296,7 +296,7 @@ uint32_t  dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
    break;
  }

-  if (harq_process->C >= MAX_NUM_DLSCH_SEGMENTS/bw_scaling) {
+  if (harq_process->C > MAX_NUM_DLSCH_SEGMENTS/bw_scaling) {
    LOG_E(PHY,"Illegal harq_process->C %d > %d\n",harq_process->C,MAX_NUM_DLSCH_SEGMENTS/bw_scaling);
    return((1+dlsch->max_turbo_iterations));
  }
@@ -320,7 +320,7 @@ uint32_t  dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
    else if (Kr_bytes <= 768)
      iind = 123 + ((Kr_bytes-256)>>3);
    else {
-      msg("dlsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes);
+      printf("dlsch_decoding: Illegal codeword size %d!!!\n",Kr_bytes);
      return(dlsch->max_turbo_iterations);
    }


--- a/openair1/PHY/MODULATION/ofdm_mod.c
+++ b/openair1/PHY/MODULATION/ofdm_mod.c
@@ -91,7 +91,7 @@ void PHY_ofdm_mod(int *input,                       /// pointer to complex input
                 )
 {

-  static short temp[2048*4] __attribute__((aligned(16)));
+  static short temp[2048*4] __attribute__((aligned(32)));
  unsigned short i,j;
  short k;

@@ -139,12 +139,18 @@ void PHY_ofdm_mod(int *input,                       /// pointer to complex input
    msg("[PHY] symbol %d/%d (%p,%p -> %p)\n",i,nb_symbols,input,&input[i<<log2fftsize],&output[(i<<log2fftsize) + ((i)*nb_prefix_samples)]);
 #endif

+#ifndef __AVX2__
+    // handle 128-bit alignment for 128-bit SIMD (SSE4, NEON, AltiVec)
    idft((int16_t *)&input[i<<log2fftsize],
         (log2fftsize==7) ? (int16_t *)temp : (int16_t *)&output[(i<<log2fftsize) + ((1+i)*nb_prefix_samples)],
         1);
-    //    write_output("fft_out.m","fftout",temp,(1<<log2fftsize)*2,1,1);
+#else
+    // on AVX2 need 256-bit alignment
+    idft((int16_t *)&input[i<<log2fftsize],
+         (log2fftsize<=9) ? (int16_t *)temp : (int16_t *)&output[(i<<log2fftsize) + ((1+i)*nb_prefix_samples)],
+         1);

-    //memset(temp,0,1<<log2fftsize);
+#endif


    // Copy to frame buffer with Cyclic Extension
@@ -158,12 +164,20 @@ void PHY_ofdm_mod(int *input,                       /// pointer to complex input

      //      msg("Doing cyclic prefix method\n");

-      if (log2fftsize==7) {
+#ifndef __AVX2__
+      if (log2fftsize==7) 
+#else
+      if (log2fftsize<=9) 
+#endif
+      {
        for (j=0; j<((1<<log2fftsize)) ; j++) {
          output_ptr[j] = temp_ptr[j];
        }
      }

+
+
+
      j=(1<<log2fftsize);

      for (k=-1; k>=-nb_prefix_samples; k--) {

--- a/openair1/PHY/MODULATION/slot_fep.c
+++ b/openair1/PHY/MODULATION/slot_fep.c
@@ -56,7 +56,7 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
  unsigned int rx_offset;

  void (*dft)(int16_t *,int16_t *, int);
-  int tmp_dft_in[256];  // This is for misalignment issues for 6 and 15 PRBs
+  int tmp_dft_in[2048];  // This is for misalignment issues for 6 and 15 PRBs

  switch (frame_parms->log2_symbol_size) {
  case 7:
@@ -96,12 +96,12 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,


  if (l<0 || l>=7-frame_parms->Ncp) {
-    msg("slot_fep: l must be between 0 and %d\n",7-frame_parms->Ncp);
+    printf("slot_fep: l must be between 0 and %d\n",7-frame_parms->Ncp);
    return(-1);
  }

  if (Ns<0 || Ns>=20) {
-    msg("slot_fep: Ns must be between 0 and 19\n");
+    printf("slot_fep: Ns must be between 0 and 19\n");
    return(-1);
  }

@@ -111,12 +111,12 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
    memset(&ue_common_vars->rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],0,frame_parms->ofdm_symbol_size*sizeof(int));

    rx_offset = sample_offset + slot_offset + nb_prefix_samples0 + subframe_offset - SOFFSET;
-    // Align with 128 bit
-    rx_offset = rx_offset - rx_offset % 4;
+    // Align with 256 bit
+    //    rx_offset = rx_offset&0xfffffff8;

 #ifdef DEBUG_FEP
    //  if (phy_vars_ue->frame <100)
-    msg("slot_fep: frame %d: slot %d, symbol %d, nb_prefix_samples %d, nb_prefix_samples0 %d, slot_offset %d, subframe_offset %d, sample_offset %d,rx_offset %d\n", phy_vars_ue->frame_rx,Ns, symbol,
+    printf("slot_fep: frame %d: slot %d, symbol %d, nb_prefix_samples %d, nb_prefix_samples0 %d, slot_offset %d, subframe_offset %d, sample_offset %d,rx_offset %d\n", phy_vars_ue->frame_rx,Ns, symbol,
        nb_prefix_samples,nb_prefix_samples0,slot_offset,subframe_offset,sample_offset,rx_offset);
 #endif

@@ -127,9 +127,9 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
               (short *)&ue_common_vars->rxdata[aa][0],
               frame_parms->ofdm_symbol_size*sizeof(int));

-      if ((rx_offset&3)!=0) {  // if input to dft is not 128-bit aligned, issue for size 6 and 15 PRBs
+      if ((rx_offset&7)!=0) {  // if input to dft is not 256-bit aligned, issue for size 6,15 and 25 PRBs
        memcpy((void *)tmp_dft_in,
-               (void *)&ue_common_vars->rxdata[aa][(rx_offset-nb_prefix_samples0) % frame_length_samples],
+               (void *)&ue_common_vars->rxdata[aa][rx_offset % frame_length_samples],
               frame_parms->ofdm_symbol_size*sizeof(int));
        dft((int16_t *)tmp_dft_in,
            (int16_t *)&ue_common_vars->rxdataF[aa][frame_parms->ofdm_symbol_size*symbol],1);
@@ -142,12 +142,12 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,

      }
    } else {
-      rx_offset += (frame_parms->ofdm_symbol_size+nb_prefix_samples) +
-                   (frame_parms->ofdm_symbol_size+nb_prefix_samples)*(l-1);
+      rx_offset += (frame_parms->ofdm_symbol_size+nb_prefix_samples)*l;// +
+      //                   (frame_parms->ofdm_symbol_size+nb_prefix_samples)*(l-1);

 #ifdef DEBUG_FEP
      //  if (phy_vars_ue->frame <100)
-      msg("slot_fep: frame %d: slot %d, symbol %d, nb_prefix_samples %d, nb_prefix_samples0 %d, slot_offset %d, subframe_offset %d, sample_offset %d,rx_offset %d\n", phy_vars_ue->frame_rx,Ns, symbol,
+      printf("slot_fep: frame %d: slot %d, symbol %d, nb_prefix_samples %d, nb_prefix_samples0 %d, slot_offset %d, subframe_offset %d, sample_offset %d,rx_offset %d\n", phy_vars_ue->frame_rx,Ns, symbol,
          nb_prefix_samples,nb_prefix_samples0,slot_offset,subframe_offset,sample_offset,rx_offset);
 #endif

@@ -158,7 +158,7 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,

      start_meas(&phy_vars_ue->rx_dft_stats);

-      if ((rx_offset&3)!=0) {  // if input to dft is not 128-bit aligned, issue for size 6 and 15 PRBs
+      if ((rx_offset&7)!=0) {  // if input to dft is not 256-bit aligned, issue for size 6,15 and 25 PRBs
        memcpy((void *)tmp_dft_in,
               (void *)&ue_common_vars->rxdata[aa][(rx_offset) % frame_length_samples],
               frame_parms->ofdm_symbol_size*sizeof(int));
@@ -182,7 +182,7 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
      for (aa=0; aa<frame_parms->nb_antennas_tx_eNB; aa++) {

 #ifdef DEBUG_FEP
-        msg("Channel estimation eNB %d, aatx %d, slot %d, symbol %d\n",eNB_id,aa,Ns,l);
+        printf("Channel estimation eNB %d, aatx %d, slot %d, symbol %d\n",eNB_id,aa,Ns,l);
 #endif
        start_meas(&phy_vars_ue->dlsch_channel_estimation_stats);
        lte_dl_channel_estimation(phy_vars_ue,eNB_id,0,
@@ -205,7 +205,7 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
      // do frequency offset estimation here!
      // use channel estimates from current symbol (=ch_t) and last symbol (ch_{t-1})
 #ifdef DEBUG_FEP
-      msg("Frequency offset estimation\n");
+      printf("Frequency offset estimation\n");
 #endif

      if (l==(4-frame_parms->Ncp)) {
@@ -222,7 +222,7 @@ int slot_fep(PHY_VARS_UE *phy_vars_ue,
  }

 #ifdef DEBUG_FEP
-  msg("slot_fep: done\n");
+  printf("slot_fep: done\n");
 #endif
  return(0);
 }
--- a/openair1/PHY/TOOLS/Makefile
+++ b/openair1/PHY/TOOLS/Makefile
-lte_dfts: lte_dfts.c
-	gcc -O2 -mavx2 -g -ggdb -o lte_dfts lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS
+lte_dfts_sse4: lte_dfts.c
+	gcc -O2 -msse4.1 -g -ggdb -o lte_dfts_sse4 lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS

-lte_dfts.s: lte_dfts.c
+lte_dfts_avx2: lte_dfts.c
+	gcc -O2 -mavx2 -g -ggdb -o lte_dfts_avx2 lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS
+
+lte_dfts_avx2.s: lte_dfts.c
 	gcc -O2 -mavx2 -S lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS

-dft_cycles: lte_dfts
-	./lte_dfts | egrep cycles
+lte_dfts_sse4.s: lte_dfts.c
+	gcc -O2 -msse4.1 -S lte_dfts.c time_meas.c file_output.c ../../SIMULATION/TOOLS/taus.c -I$$OPENAIR1_DIR -I$$OPENAIR_TARGETS -I$$OPENAIR2_DIR/COMMON -DUSER_MODE -DMR_MAIN -DNB_ANTENNAS_RX=1 # -DD256STATS #-DD64STATS
+
+dft_cycles_avx2: lte_dfts_avx2
+	./lte_dfts_avx2 | egrep cycles
--- a/openair1/PHY/TOOLS/lte_dfts.c
+++ b/openair1/PHY/TOOLS/lte_dfts.c
--- a/openair1/PHY/TOOLS/twiddles8192.h
+++ b/openair1/PHY/TOOLS/twiddles8192.h
--- a/openair1/SIMULATION/LTE_PHY/dlsim.c
+++ b/openair1/SIMULATION/LTE_PHY/dlsim.c
@@ -3336,7 +3336,7 @@ PMI_FEEDBACK:
            PHY_vars_UE->dlsch_ue[0][cw]->harq_processes[PHY_vars_UE->dlsch_ue[0][cw]->current_harq_pid]->G = coded_bits_per_codeword;


-            /*
+	                
            // calculate uncoded BER
            uncoded_ber=0;
            for (i=0;i<coded_bits_per_codeword;i++)
@@ -3352,7 +3352,7 @@ PMI_FEEDBACK:

            if (n_frames==1)
              write_output("uncoded_ber_bit.m","uncoded_ber_bit",uncoded_ber_bit,coded_bits_per_codeword,1,0);
-            */
+            

            start_meas(&PHY_vars_UE->dlsch_unscrambling_stats);
            dlsch_unscrambling(&PHY_vars_UE->lte_frame_parms,
@@ -3463,10 +3463,10 @@ PMI_FEEDBACK:
              }

              sprintf(fname,"rxsig0_r%d.m",round);
-              sprintf(vname,"rxs0_r%d.m",round);
+              sprintf(vname,"rxs0_r%d",round);
              write_output(fname,vname, &PHY_vars_UE->lte_ue_common_vars.rxdata[0][0],10*PHY_vars_UE->lte_frame_parms.samples_per_tti,1,1);
              sprintf(fname,"rxsigF0_r%d.m",round);
-              sprintf(vname,"rxs0F_r%d.m",round);
+              sprintf(vname,"rxs0F_r%d",round);
              write_output(fname,vname, &PHY_vars_UE->lte_ue_common_vars.rxdataF[0][0],2*PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb,2,1);
 	     
              if (PHY_vars_UE->lte_frame_parms.nb_antennas_rx>1) {
@@ -3479,14 +3479,14 @@ PMI_FEEDBACK:
              }

              sprintf(fname,"dlsch00_r%d.m",round);
-              sprintf(vname,"dl00_r%d.m",round);
+              sprintf(vname,"dl00_r%d",round);
              write_output(fname,vname,
                           &(PHY_vars_UE->lte_ue_common_vars.dl_ch_estimates[eNB_id][0][0]),
                           PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb,1,1);

              if (PHY_vars_UE->lte_frame_parms.nb_antennas_rx>1) {
                sprintf(fname,"dlsch01_r%d.m",round);
-                sprintf(vname,"dl01_r%d.m",round);
+                sprintf(vname,"dl01_r%d",round);
                write_output(fname,vname,
                             &(PHY_vars_UE->lte_ue_common_vars.dl_ch_estimates[eNB_id][1][0]),
                             PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb/2,1,1);
@@ -3494,7 +3494,7 @@ PMI_FEEDBACK:

              if (PHY_vars_eNB->lte_frame_parms.nb_antennas_tx>1) {
                sprintf(fname,"dlsch10_r%d.m",round);
-                sprintf(vname,"dl10_r%d.m",round);
+                sprintf(vname,"dl10_r%d",round);
                write_output(fname,vname,
                             &(PHY_vars_UE->lte_ue_common_vars.dl_ch_estimates[eNB_id][2][0]),
                             PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb/2,1,1);
@@ -3502,7 +3502,7 @@ PMI_FEEDBACK:

              if ((PHY_vars_UE->lte_frame_parms.nb_antennas_rx>1) && (PHY_vars_eNB->lte_frame_parms.nb_antennas_tx>1)) {
                sprintf(fname,"dlsch11_r%d.m",round);
-                sprintf(vname,"dl11_r%d.m",round);
+                sprintf(vname,"dl11_r%d",round);
                write_output(fname,vname,
                             &(PHY_vars_UE->lte_ue_common_vars.dl_ch_estimates[eNB_id][3][0]),
                             PHY_vars_UE->lte_frame_parms.ofdm_symbol_size*nsymb/2,1,1);