From ba55fcac27fb34dad97beea1bcd609a6dfa8a88d Mon Sep 17 00:00:00 2001
From: hbilel <haithem.bilel@alcatelonetouch.com>
Date: Mon, 6 Mar 2017 12:13:20 +0100
Subject: [PATCH]   [OAI-UE] Mimo LLR computation AVX2 + fix in TM3 deprecoding

---
 cmake_targets/CMakeLists.txt                  |  1 +
 .../PHY/LTE_TRANSPORT/dlsch_demodulation.c    | 23 ++++++++------
 .../PHY/LTE_TRANSPORT/dlsch_llr_computation.c | 31 ++++++++++++++++++-
 openair1/PHY/LTE_TRANSPORT/proto.h            | 16 ++++++++++
 openair1/PHY/Makefile.inc                     |  1 +
 openair3/NAS/UE/ESM/esm_ebr_context.c         |  2 +-
 targets/RT/USER/lte-softmodem.c               |  8 ++++-
 7 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/cmake_targets/CMakeLists.txt b/cmake_targets/CMakeLists.txt
index e4d3acf4b1..0bf63cfb1a 100644
--- a/cmake_targets/CMakeLists.txt
+++ b/cmake_targets/CMakeLists.txt
@@ -1034,6 +1034,7 @@ set(PHY_SRC
   ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_modulation.c
   ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_demodulation.c
   ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_llr_computation.c
+  ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_llr_computation_avx2.c
   ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/power_control.c
   ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_decoding.c
   ${OPENAIR1_DIR}/PHY/LTE_TRANSPORT/dlsch_scrambling.c
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
index 9ae212cd27..7b1fb3cd50 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
@@ -42,6 +42,7 @@
 #define NOCYGWIN_STATIC
 #endif
 
+extern int16_t dlsch_demod_shift;
 //#define DEBUG_HARQ
 
 //#undef LOG_D
@@ -402,8 +403,8 @@ int rx_pdsch(PHY_VARS_UE *ue,
       LOG_D(PHY,"Channel Level TM34  avg_0 %d, avg_1 %d, rx_type %d, rx_standard %d, interf_unaw_shift %d \n", avg_0[0],
               avg_1[0], rx_type, rx_standard, interf_unaw_shift);
         if (rx_type>rx_standard) {
-          avg_0[0] = (log2_approx(avg_0[0])/2) - 5 + 2 ;//+ 2;
-          avg_1[0] = (log2_approx(avg_1[0])/2) - 5 + 2 ;//+ 2;
+          avg_0[0] = (log2_approx(avg_0[0])/2) + dlsch_demod_shift;// + 2 ;//+ 4;
+          avg_1[0] = (log2_approx(avg_1[0])/2) + dlsch_demod_shift;// + 2 ;//+ 4;
           pdsch_vars[eNB_id]->log2_maxh0 = cmax(avg_0[0],0);
           pdsch_vars[eNB_id]->log2_maxh1 = cmax(avg_1[0],0);
           //printf("TM4 I-A log2_maxh0 = %d\n", pdsch_vars[eNB_id]->log2_maxh0);
@@ -1067,9 +1068,9 @@ int rx_pdsch(PHY_VARS_UE *ue,
       write_output("dl_ch_estimates_ext10.m", "dl_ch_estimates_ext10", &pdsch_vars[eNB_id]->dl_ch_estimates_ext[2][0],14*frame_parms->N_RB_DL*12,1,1);
       write_output("dl_ch_estimates_ext11.m", "dl_ch_estimates_ext11", &pdsch_vars[eNB_id]->dl_ch_estimates_ext[3][0],14*frame_parms->N_RB_DL*12,1,1);
       write_output("rxdataF_comp00.m","rxdataF_comp00",              &pdsch_vars[eNB_id]->rxdataF_comp0[0][0],14*frame_parms->N_RB_DL*12,1,1);
-      write_output("rxdataF_comp01.m","rxdataF_comp01",              &pdsch_vars[eNB_id]->rxdataF_comp0[0][0],14*frame_parms->N_RB_DL*12,1,1);
-      write_output("rxdataF_comp10.m","rxdataF_comp10",              &pdsch_vars[eNB_id]->rxdataF_comp0[0][0],14*frame_parms->N_RB_DL*12,1,1);
-      write_output("rxdataF_comp11.m","rxdataF_comp11",              &pdsch_vars[eNB_id]->rxdataF_comp0[0][0],14*frame_parms->N_RB_DL*12,1,1);
+      write_output("rxdataF_comp01.m","rxdataF_comp01",              &pdsch_vars[eNB_id]->rxdataF_comp0[1][0],14*frame_parms->N_RB_DL*12,1,1);
+      write_output("rxdataF_comp10.m","rxdataF_comp10",              &pdsch_vars[eNB_id]->rxdataF_comp1[harq_pid][round][0][0],14*frame_parms->N_RB_DL*12,1,1);
+      write_output("rxdataF_comp11.m","rxdataF_comp11",              &pdsch_vars[eNB_id]->rxdataF_comp1[harq_pid][round][1][0],14*frame_parms->N_RB_DL*12,1,1);
 #endif
       write_output("llr0.m","llr0",  &pdsch_vars[eNB_id]->llr[0][0],(14*nb_rb*12*dlsch1_harq->Qm) - 4*(nb_rb*4*dlsch1_harq->Qm),1,0);
       write_output("llr1.m","llr1",  &pdsch_vars[eNB_id]->llr[1][0],(14*nb_rb*12*dlsch1_harq->Qm) - 4*(nb_rb*4*dlsch1_harq->Qm),1,0);
@@ -1666,9 +1667,8 @@ void prec2A_TM3_128(__m128i *ch0,__m128i *ch1) {
 
   __m128i tmp0,tmp1;
 
-  // sqrt(2) is already taken into account in computation sqrt_rho_a, sqrt_rho_b,
-  //so divide by 2 is replaced by divide by sqrt(2).
 
+//_mm_mulhi_epi16
   //  print_shorts("prec2A_TM3 ch0 (before):",ch0);
   //  print_shorts("prec2A_TM3 ch1 (before):",ch1);
 
@@ -1679,6 +1679,11 @@ void prec2A_TM3_128(__m128i *ch0,__m128i *ch1) {
   ch0[0] = _mm_adds_epi16(ch0[0],tmp1);
   ch1[0] = _mm_subs_epi16(tmp0,tmp1);
 
+  ch0[0] = _mm_mulhi_epi16(ch0[0],amp);
+  ch0[0] = _mm_slli_epi16(ch0[0],1);
+
+  ch1[0] = _mm_mulhi_epi16(ch1[0],amp);
+  ch1[0] = _mm_slli_epi16(ch1[0],1);
 
   //  print_shorts("prec2A_TM3 ch0 (mid):",&tmp0);
   //  print_shorts("prec2A_TM3 ch1 (mid):",ch1);
@@ -1688,8 +1693,8 @@ void prec2A_TM3_128(__m128i *ch0,__m128i *ch1) {
   ch1[0] = _mm_mulhi_epi16(ch1[0],amp);
   ch1[0] = _mm_slli_epi16(ch1[0],1);
 
-  // ch0[0] = _mm_srai_epi16(ch0[0],1);
-  // ch1[0] = _mm_srai_epi16(ch1[0],1);
+  //ch0[0] = _mm_srai_epi16(ch0[0],1);
+  //ch1[0] = _mm_srai_epi16(ch1[0],1);
 
   //  print_shorts("prec2A_TM3 ch0 (after):",ch0);
   //  print_shorts("prec2A_TM3 ch1 (after):",ch1);
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c
index 0b73e2b023..c33dec87d3 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_llr_computation.c
@@ -8831,6 +8831,7 @@ int dlsch_64qam_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
     len = (nb_rb*12) - pbch_pss_sss_adjust;
   }
 
+#if 0
   qam64_qam64((short *)rxF,
               (short *)rxF_i,
               (short *)ch_mag,
@@ -8838,7 +8839,35 @@ int dlsch_64qam_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
               (short *)llr16,
               (short *)rho,
               len);
-
+#else
+  // Pad to the next multiple of 16 32-bit words strictly above len (len256i > len always)
+  uint32_t len256i = ((len+16)>>4)*16;
+  int32_t *rxF_256i      = (int32_t*) malloc16_clear(len256i*4);
+  int32_t *rxF_i_256i    = (int32_t*) malloc16_clear(len256i*4);
+  int32_t *ch_mag_256i   = (int32_t*) malloc16_clear(len256i*4);
+  int32_t *ch_mag_i_256i = (int32_t*) malloc16_clear(len256i*4);
+  int32_t *rho_256i      = (int32_t*) malloc16_clear(len256i*4);
+  // Copy only the len valid words of each input into its padded scratch buffer
+  memcpy(rxF_256i, rxF, len*4);
+  memcpy(rxF_i_256i, rxF_i, len*4);
+  memcpy(ch_mag_256i, ch_mag, len*4);
+  memcpy(ch_mag_i_256i, ch_mag_i, len*4);
+  memcpy(rho_256i, rho, len*4);
+  // NOTE(review): the kernel is given len, not len256i -- confirm it tolerates a length that is not a multiple of 16
+  qam64_qam64_avx2((int32_t *)rxF_256i,
+                   (int32_t *)rxF_i_256i,
+                   (int32_t *)ch_mag_256i,
+                   (int32_t *)ch_mag_i_256i,
+                   (int16_t *)llr16,
+                   (int32_t *) rho_256i,
+                   len);
+  // Release each buffer with its allocated byte count, not sizeof(pointer)
+  free16(rxF_256i, len256i*4);
+  free16(rxF_i_256i, len256i*4);
+  free16(ch_mag_256i, len256i*4);
+  free16(ch_mag_i_256i, len256i*4);
+  free16(rho_256i, len256i*4);
+#endif
   llr16 += (6*len);
   *llr16p = (short *)llr16;
   return(0);
diff --git a/openair1/PHY/LTE_TRANSPORT/proto.h b/openair1/PHY/LTE_TRANSPORT/proto.h
index 5fc8dea986..a5aa145e17 100644
--- a/openair1/PHY/LTE_TRANSPORT/proto.h
+++ b/openair1/PHY/LTE_TRANSPORT/proto.h
@@ -746,6 +746,22 @@ void qam64_qam64(short *stream0_in,
                  short *rho01,
                  int length);
 
+/** \brief This function computes the LLRs for ML (max-logsum approximation) dual-stream 64QAM/64QAM reception.
+    @param stream0_in Input from channel compensated (MR combined) stream 0
+    @param stream1_in Input from channel compensated (MR combined) stream 1
+    @param ch_mag   Input from scaled channel magnitude square of h0'*g0
+    @param ch_mag_i Input from scaled channel magnitude square of h0'*g1
+    @param stream0_out Output from LLR unit for stream0
+    @param rho01 Cross-correlation between channels (MR combined)
+    @param length in complex channel outputs*/
+void qam64_qam64_avx2(int32_t *stream0_in,
+                      int32_t *stream1_in,
+                      int32_t *ch_mag,
+                      int32_t *ch_mag_i,
+                      int16_t *stream0_out,
+                      int32_t *rho01,
+                      int length);
+
 /** \brief This function perform LLR computation for dual-stream (64QAM/64QAM) transmission.
     @param frame_parms Frame descriptor structure
     @param rxdataF_comp Compensated channel output
diff --git a/openair1/PHY/Makefile.inc b/openair1/PHY/Makefile.inc
index 1586f353c0..90094b31f8 100644
--- a/openair1/PHY/Makefile.inc
+++ b/openair1/PHY/Makefile.inc
@@ -7,6 +7,7 @@ PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_coding.o
 PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_modulation.o
 PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_demodulation.o
 PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_llr_computation.o
+PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_llr_computation_avx2.o
 PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/power_control.o
 PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_decoding.o
 PHY_OBJS += $(TOP_DIR)/PHY/LTE_TRANSPORT/dlsch_scrambling.o
diff --git a/openair3/NAS/UE/ESM/esm_ebr_context.c b/openair3/NAS/UE/ESM/esm_ebr_context.c
index bfa4a04cc8..31f215e359 100644
--- a/openair3/NAS/UE/ESM/esm_ebr_context.c
+++ b/openair3/NAS/UE/ESM/esm_ebr_context.c
@@ -286,7 +286,7 @@ int esm_ebr_context_create(
              LOG_TRACE(INFO, "ESM-PROC  - executing %s ",
                        command_line);
 
-             if (system(command_line)) ; /* TODO: what to do? */
+             //if (system(command_line)) ; /* TODO: what to do? */
 
              break;
 
diff --git a/targets/RT/USER/lte-softmodem.c b/targets/RT/USER/lte-softmodem.c
index 8303d88218..5761577259 100644
--- a/targets/RT/USER/lte-softmodem.c
+++ b/targets/RT/USER/lte-softmodem.c
@@ -151,6 +151,8 @@ uint8_t usim_test = 0;
 uint8_t nb_antenna_tx = 1;
 uint8_t nb_antenna_rx = 1;
 
+int16_t dlsch_demod_shift = 0;
+
 char ref[128] = "internal";
 char channels[128] = "0";
 
@@ -635,6 +637,7 @@ static void get_options (int argc, char **argv) {
         LONG_OPTION_THREADIQ,
         LONG_OPTION_THREADODDSUBFRAME,
         LONG_OPTION_THREADEVENSUBFRAME,
+        LONG_OPTION_DEMOD_SHIFT,
 #if T_TRACER
         LONG_OPTION_T_PORT,
         LONG_OPTION_T_NOWAIT,
@@ -670,6 +673,7 @@ static void get_options (int argc, char **argv) {
         {"threadIQ",  required_argument, NULL, LONG_OPTION_THREADIQ},
         {"threadOddSubframe",  required_argument, NULL, LONG_OPTION_THREADODDSUBFRAME},
         {"threadEvenSubframe",  required_argument, NULL, LONG_OPTION_THREADEVENSUBFRAME},
+        {"dlsch-demod-shift", required_argument,  NULL, LONG_OPTION_DEMOD_SHIFT},
 #if T_TRACER
         {"T_port",                 required_argument, 0, LONG_OPTION_T_PORT},
         {"T_nowait",               no_argument,       0, LONG_OPTION_T_NOWAIT},
@@ -800,7 +804,9 @@ static void get_options (int argc, char **argv) {
     case LONG_OPTION_THREADEVENSUBFRAME:
        threads.even=atoi(optarg);
        break;
-
+    case LONG_OPTION_DEMOD_SHIFT:
+        dlsch_demod_shift = atoi(optarg); /* integer shift (int16_t): parse as an integer like the other options, not via atof() */
+        break;
 #if T_TRACER
         case LONG_OPTION_T_PORT: {
             extern int T_port;
-- 
2.26.2