corrections for ARM NEON port. IFFT/FFTs are validated (512/1024/2048), 7.5...

corrections for ARM NEON port. IFFT/FFTs are validated (512/1024/2048), 7.5 kHz frequency-shift for eNodeB RX, UE TX validated. UL channel estimation ok for DMRS symbols, interpolation is broken on NEON.

corrections for ARM NEON port. IFFT/FFTs are validated (512/1024/2048), 7.5...
corrections for ARM NEON port. IFFT/FFTs are validated (512/1024/2048), 7.5 kHz frequency-shift for eNodeB RX, UE TX validated. UL channel estimation ok for DMRS symbols, interpolation is broken on NEON.
562ce288 · Raymond Knopp · 48b95afc · 562ce288 · 562ce288 · 562ce288
Commit 562ce288 authored May 20, 2018 by Raymond Knopp
7 changed files
--- a/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c
+++ b/openair1/PHY/LTE_ESTIMATION/lte_ul_channel_estimation.c
@@ -173,6 +173,11 @@ int32_t temp_in_ifft_0[2048*2] __attribute__((aligned(32)));
        mmtmpU3 = _mm_unpackhi_epi32(mmtmpU0,mmtmpU1);

        ul_ch128[2] = _mm_packs_epi32(mmtmpU2,mmtmpU3);
+
+        ul_ch128+=3;
+        ul_ref128+=3;
+        rxdataF128+=3;
+
 #elif defined(__arm__) || defined(__aarch64__)
      mmtmp0 = vmull_s16(((int16x4_t*)ul_ref128)[0],((int16x4_t*)rxdataF128)[0]);
      mmtmp1 = vmull_s16(((int16x4_t*)ul_ref128)[1],((int16x4_t*)rxdataF128)[1]);
@@ -183,7 +188,7 @@ int32_t temp_in_ifft_0[2048*2] __attribute__((aligned(32)));
      mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)),
                              vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1)));

-      ul_ch128[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im));
+      ((int16x4x2_t*)ul_ch128)[0] = vzip_s16(vshrn_n_s32(mmtmp_re,15),vshrn_n_s32(mmtmp_im,15));
      ul_ch128++;
      ul_ref128++;
      rxdataF128++;
@@ -196,7 +201,7 @@ int32_t temp_in_ifft_0[2048*2] __attribute__((aligned(32)));
      mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)),
                              vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1)));

-      ul_ch128[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im));
+      ((int16x4x2_t*)ul_ch128)[0] = vzip_s16(vshrn_n_s32(mmtmp_re,15),vshrn_n_s32(mmtmp_im,15));
      ul_ch128++;
      ul_ref128++;
      rxdataF128++;
@@ -210,16 +215,13 @@ int32_t temp_in_ifft_0[2048*2] __attribute__((aligned(32)));
      mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)),
                              vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1)));

-      ul_ch128[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im));
+      ((int16x4x2_t*)ul_ch128)[0] = vzip_s16(vshrn_n_s32(mmtmp_re,15),vshrn_n_s32(mmtmp_im,15));
      ul_ch128++;
      ul_ref128++;
      rxdataF128++;


 #endif
-        ul_ch128+=3;
-        ul_ref128+=3;
-        rxdataF128+=3;
      }

      alpha_ind = 0;

--- a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
--- a/openair1/PHY/MODULATION/slot_fep.c
+++ b/openair1/PHY/MODULATION/slot_fep.c
@@ -21,7 +21,7 @@

 #include "PHY/defs.h"
 #include "defs.h"
-//#define DEBUG_FEP
+#define DEBUG_FEP

 #define SOFFSET 0

@@ -240,6 +240,7 @@ int slot_fep(PHY_VARS_UE *ue,
 #ifdef DEBUG_FEP
  printf("slot_fep: done\n");
 #endif
+
  return(0);
 }


--- a/openair1/PHY/MODULATION/ul_7_5_kHz.c
+++ b/openair1/PHY/MODULATION/ul_7_5_kHz.c
@@ -30,6 +30,9 @@ short conjugate75[8]__attribute__((aligned(16))) = {-1,1,-1,1,-1,1,-1,1} ;
 short conjugate75_2[8]__attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1} ;
 short negate[8]__attribute__((aligned(16))) = {-1,-1,-1,-1,-1,-1,-1,-1};

+#define print_shorts(s,x) printf("%s : %d %d %d %d %d %d %d %d\n",s,((int16_t*)x)[0],((int16_t*)x)[1],((int16_t*)x)[2],((int16_t*)x)[3],((int16_t*)x)[4],((int16_t*)x)[5],((int16_t*)x)[6],((int16_t*)x)[7])
+#define print_dw(s,x) printf("%s : %d %d %d %d\n",s,((int32_t*)x)[0],((int32_t*)x)[1],((int32_t*)x)[2],((int32_t*)x)[3])
+
 void apply_7_5_kHz(PHY_VARS_UE *ue,int32_t*txdata,uint8_t slot)
 {

@@ -90,7 +93,6 @@ void apply_7_5_kHz(PHY_VARS_UE *ue,int32_t*txdata,uint8_t slot)
  kHz7_5ptr128 = (int16x8_t*)kHz7_5ptr;
 #endif
  // apply 7.5 kHz
-
  for (i=0; i<(len>>2); i++) {
 #if defined(__x86_64__) || defined(__i386__)
    mmtmp_re = _mm_madd_epi16(*txptr128,*kHz7_5ptr128);
@@ -117,15 +119,34 @@ void apply_7_5_kHz(PHY_VARS_UE *ue,int32_t*txdata,uint8_t slot)
                            vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1)));
        //mmtmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] 

-    mmtmp0 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)txptr128)[0],*(int16x4_t*)conjugate75_2)),((int16x4_t*)kHz7_5ptr128)[0]);
+    /*if (i<4) {   
+       print_shorts("txp:",txptr128);
+       print_shorts("kHz75:",kHz7_5ptr128);
+       print_dw("mmtmp0",&mmtmp0);
+       print_dw("mmtmp1",&mmtmp1); 
+    }*/
+    mmtmp0 = vmull_s16(((int16x4_t*)txptr128)[0],
+                       vrev32_s16(vmul_s16(((int16x4_t*)kHz7_5ptr128)[0],
+                                           *(int16x4_t*)conjugate75_2)));
        //mmtmp0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
-    mmtmp1 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)txptr128)[1],*(int16x4_t*)conjugate75_2)), ((int16x4_t*)kHz7_5ptr128)[1]);
+    mmtmp1 = vmull_s16(((int16x4_t*)txptr128)[1],
+                       vrev32_s16(vmul_s16(((int16x4_t*)kHz7_5ptr128)[1],
+                                           *(int16x4_t*)conjugate75_2)));
        //mmtmp0 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
+
+
+
+
    mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)),
                            vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1)));
        //mmtmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]

-    txptr128[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im));
+    ((int16x4x2_t*)txptr128)[0] = vzip_s16(vshrn_n_s32(mmtmp_re,15),vshrn_n_s32(mmtmp_im,15));
+    /*if (i<4) {
+       print_dw("mmtmp_re",&mmtmp_re);
+       print_dw("mmtmp_im",&mmtmp_im);
+       print_shorts("txp",txptr128);
+    }*/
    txptr128++;
    kHz7_5ptr128++;
 #endif
@@ -238,15 +259,21 @@ void remove_7_5_kHz(RU_t *ru,uint8_t slot)
                              vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1)));
        //mmtmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])]

-      mmtmp0 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)rxptr128)[0],*(int16x4_t*)conjugate75_2)), ((int16x4_t*)kHz7_5ptr128)[0]);
+      mmtmp0 = vmull_s16(((int16x4_t*)rxptr128)[0],
+                         vrev32_s16(vmul_s16(((int16x4_t*)kHz7_5ptr128)[0],
+                                           *(int16x4_t*)conjugate75_2)));
        //mmtmp0 = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
-      mmtmp1 = vmull_s16(vrev32_s16(vmul_s16(((int16x4_t*)rxptr128)[1],*(int16x4_t*)conjugate75_2)), ((int16x4_t*)kHz7_5ptr128)[1]);
+      mmtmp1 = vmull_s16(((int16x4_t*)rxptr128)[1],
+                       vrev32_s16(vmul_s16(((int16x4_t*)kHz7_5ptr128)[1],
+                                           *(int16x4_t*)conjugate75_2)));
+
        //mmtmp1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
      mmtmp_im = vcombine_s32(vpadd_s32(vget_low_s32(mmtmp0),vget_high_s32(mmtmp0)),
                              vpadd_s32(vget_low_s32(mmtmp1),vget_high_s32(mmtmp1)));
        //mmtmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]

-      rxptr128_7_5kHz[0] = vcombine_s16(vmovn_s32(mmtmp_re),vmovn_s32(mmtmp_im));
+    ((int16x4x2_t*)rxptr128_7_5kHz)[0] = vzip_s16(vshrn_n_s32(mmtmp_re,15),vshrn_n_s32(mmtmp_im,15));
+
      rxptr128_7_5kHz++;
      rxptr128++;
      kHz7_5ptr128++;

--- a/openair1/PHY/TOOLS/lte_dfts.c
+++ b/openair1/PHY/TOOLS/lte_dfts.c
@@ -684,9 +684,12 @@ static inline void bfly2_16(int16x8_t *x0, int16x8_t *x1, int16x8_t *y0, int16x8

 static inline void bfly2_16(int16x8_t *x0, int16x8_t *x1, int16x8_t *y0, int16x8_t *y1, int16x8_t *tw, int16x8_t *twb)
 {
+  int16x8_t x1t;

-  *y0  = vqaddq_s16(*x0,*x1);
-  *y1  = vqsubq_s16(*x0,*x1);
+  x1t  = packed_cmult2(*(x1),*(tw),*(twb));
+
+  *y0  = vqaddq_s16(*x0,x1t);
+  *y1  = vqsubq_s16(*x0,x1t);

 }
 #endif
@@ -18918,7 +18921,7 @@ int main(int argc, char**argv)
      ((int16_t*)x)[i] = (int16_t)((taus()&0xffff))>>5;
    }
    memset((void*)&y[0],0,16*4);
-    idft16((int16_t *)x,(int16_t *)y);
+    dft16((int16_t *)x,(int16_t *)y);
    printf("\n\n16-point\n");
    printf("X: ");
    for (i=0;i<4;i++)
@@ -19105,7 +19108,7 @@ int main(int argc, char**argv)
  reset_meas(&ts);
  for (i=0; i<10000; i++) {
    start_meas(&ts);
-    idft512((int16_t *)x,(int16_t *)y,1);
+    dft512((int16_t *)x,(int16_t *)y,1);
    stop_meas(&ts);
  }

@@ -19140,7 +19143,7 @@ int main(int argc, char**argv)

  for (i=0; i<10000; i++) {
    start_meas(&ts);
-    idft1024((int16_t *)x,(int16_t *)y,1);
+    dft1024((int16_t *)x,(int16_t *)y,1);
    stop_meas(&ts);
  }

@@ -19165,7 +19168,7 @@ int main(int argc, char**argv)

  for (i=0; i<10000; i++) {
    start_meas(&ts);
-    idft2048((int16_t *)x,(int16_t *)y,1);
+    dft2048((int16_t *)x,(int16_t *)y,1);
    stop_meas(&ts);
  }

--- a/openair1/SCHED/phy_procedures_lte_ue.c
+++ b/openair1/SCHED/phy_procedures_lte_ue.c
@@ -1308,6 +1308,7 @@ void ulsch_common_procedures(PHY_VARS_UE *ue, UE_rxtx_proc_t *proc, uint8_t empt
 			&ue->frame_parms);
      }

+ 
 #if defined(EXMIMO) || defined(OAI_USRP) || defined(OAI_BLADERF) || defined(OAI_LMSSDR)
    apply_7_5_kHz(ue,dummy_tx_buffer,0);
    apply_7_5_kHz(ue,dummy_tx_buffer,1);

--- a/openair1/SIMULATION/LTE_PHY/dlsim.c
+++ b/openair1/SIMULATION/LTE_PHY/dlsim.c
@@ -1667,9 +1667,9 @@ int main(int argc, char **argv)
              write_output("txsig0.m","txs0", &ru->common.txdata[0][subframe* eNB->frame_parms.samples_per_tti], eNB->frame_parms.samples_per_tti,1,1);

              if (transmission_mode<7) {
-	        write_output("txsigF0.m","txsF0x", &ru->common.txdataF_BF[0][subframe*nsymb*eNB->frame_parms.ofdm_symbol_size],nsymb*eNB->frame_parms.ofdm_symbol_size,1,1);
+	        write_output("txsigF0.m","txsF0x", &ru->common.txdataF_BF[0][0],nsymb*eNB->frame_parms.ofdm_symbol_size,1,1);
              } else if (transmission_mode == 7) {
-                write_output("txsigF0.m","txsF0", &ru->common.txdataF_BF[5][subframe*nsymb*eNB->frame_parms.ofdm_symbol_size],nsymb*eNB->frame_parms.ofdm_symbol_size,1,1);
+                write_output("txsigF0.m","txsF0", &ru->common.txdataF_BF[5][0],nsymb*eNB->frame_parms.ofdm_symbol_size,1,1);
                write_output("txsigF0_BF.m","txsF0_BF", &ru->common.txdataF_BF[0][0],eNB->frame_parms.ofdm_symbol_size,1,1);
              }
            }
@@ -1730,6 +1730,7 @@ int main(int argc, char **argv)

 	  phy_procedures_UE_RX(UE,proc,0,0,dci_flag,normal_txrx,no_relay,NULL);

+
 	  dci_received = dci_received - UE->pdcch_vars[UE->current_thread_id[proc->subframe_rx]][eNB_id]->dci_received;

 	  if (dci_flag && (dci_received == 0)) {