diff --git a/openair1/PHY/NR_UE_TRANSPORT/csi_rx.c b/openair1/PHY/NR_UE_TRANSPORT/csi_rx.c
index 5465e228743ab3ea3932955419ab54534a725d8f..6b7c04506198df6857795b906af8b78555e5be53 100644
--- a/openair1/PHY/NR_UE_TRANSPORT/csi_rx.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/csi_rx.c
@@ -372,6 +372,12 @@ int nr_csi_rs_ri_estimation(PHY_VARS_NR_UE *ue,
   *            | conjch01 conjch11 |   | ch10 ch11 |   | conjch01*ch00+conjch11*ch10 conjch01*ch01+conjch11*ch11 |
   */
 
+  for(uint16_t port_tx_conjch = 0; port_tx_conjch < nr_csi_rs_info->N_ports; port_tx_conjch++) {
+    for(uint16_t port_tx_ch = 0; port_tx_ch < nr_csi_rs_info->N_ports; port_tx_ch++) {
+      memset(nr_csi_rs_info->csi_rs_estimated_A_MF[port_tx_conjch][port_tx_ch],0,NR_MAX_OFDM_SYMBOL_SIZE*sizeof(int32_t));
+    }
+  }
+
   for (int rb = csirs_config_pdu->start_rb; rb < (csirs_config_pdu->start_rb+csirs_config_pdu->nr_of_rbs); rb++) {
 
     if (csirs_config_pdu->freq_density <= 1 && csirs_config_pdu->freq_density != (rb % 2)) {
@@ -379,66 +385,48 @@ int nr_csi_rs_ri_estimation(PHY_VARS_NR_UE *ue,
     }
     uint16_t k = (frame_parms->first_carrier_offset + rb*NR_NB_SC_PER_RB) % frame_parms->ofdm_symbol_size;
 
-    // conjch x ch computation
     for (int ant_rx_conjch = 0; ant_rx_conjch < frame_parms->nb_antennas_rx; ant_rx_conjch++) {
       for(uint16_t port_tx_conjch = 0; port_tx_conjch < nr_csi_rs_info->N_ports; port_tx_conjch++) {
         for (int ant_rx_ch = 0; ant_rx_ch < frame_parms->nb_antennas_rx; ant_rx_ch++) {
           for(uint16_t port_tx_ch = 0; port_tx_ch < nr_csi_rs_info->N_ports; port_tx_ch++) {
+
+            // conjch x ch computation
             nr_conjch0_mult_ch1(&csi_rs_estimated_channel_freq[ant_rx_conjch][port_tx_conjch][k],
                                 &csi_rs_estimated_channel_freq[ant_rx_ch][port_tx_ch][k],
                                 &nr_csi_rs_info->csi_rs_estimated_conjch_ch[ant_rx_conjch][port_tx_conjch][ant_rx_ch][port_tx_ch][k],
                                 1,
                                 0);
+
+            // construct Hh x H elements
+            if(ant_rx_conjch == ant_rx_ch) {
+              nr_a_sum_b((__m128i *)&nr_csi_rs_info->csi_rs_estimated_A_MF[port_tx_conjch][port_tx_ch][k],
+                         (__m128i *)&nr_csi_rs_info->csi_rs_estimated_conjch_ch[ant_rx_conjch][port_tx_conjch][ant_rx_ch][port_tx_ch][k],
+                         1);
+            }
           }
         }
       }
     }
 
-    // construct Hh x H elements
-    nr_construct_HhH_elements(0 < frame_parms->nb_antennas_rx && 0 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[0][0][0][0][k] : NULL,
-                              0 < frame_parms->nb_antennas_rx && 1 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[0][1][0][1][k] : NULL,
-                              1 < frame_parms->nb_antennas_rx && 1 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[1][1][1][1][k] : NULL,
-                              1 < frame_parms->nb_antennas_rx && 0 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[1][0][1][0][k] : NULL,
-                              2 < frame_parms->nb_antennas_rx && 0 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[2][0][2][0][k] : NULL,
-                              2 < frame_parms->nb_antennas_rx && 1 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[2][1][2][1][k] : NULL,
-                              3 < frame_parms->nb_antennas_rx && 0 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[3][0][3][0][k] : NULL,
-                              3 < frame_parms->nb_antennas_rx && 1 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[3][1][3][1][k] : NULL,
-                              0 < frame_parms->nb_antennas_rx && 1 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[0][0][0][1][k] : NULL,
-                              0 < frame_parms->nb_antennas_rx && 1 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[0][1][0][0][k] : NULL,
-                              1 < frame_parms->nb_antennas_rx && 1 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[1][0][1][1][k] : NULL,
-                              1 < frame_parms->nb_antennas_rx && 1 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[1][1][1][0][k] : NULL,
-                              2 < frame_parms->nb_antennas_rx && 1 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[2][0][2][1][k] : NULL,
-                              2 < frame_parms->nb_antennas_rx && 1 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[2][1][2][0][k] : NULL,
-                              3 < frame_parms->nb_antennas_rx && 1 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[3][0][3][1][k] : NULL,
-                              3 < frame_parms->nb_antennas_rx && 1 < nr_csi_rs_info->N_ports ? &nr_csi_rs_info->csi_rs_estimated_conjch_ch[3][1][3][0][k] : NULL,
-                              &nr_csi_rs_info->csi_rs_estimated_A_MF[0][0][k],
-                              &nr_csi_rs_info->csi_rs_estimated_A_MF[0][1][k],
-                              &nr_csi_rs_info->csi_rs_estimated_A_MF[1][0][k],
-                              &nr_csi_rs_info->csi_rs_estimated_A_MF[1][1][k],
-                              1,
-                              0);
-
     // compute the determinant of A_MF (denominator)
-    nr_det_HhH(&nr_csi_rs_info->csi_rs_estimated_A_MF[0][0][k],
-               &nr_csi_rs_info->csi_rs_estimated_A_MF[0][1][k],
-               &nr_csi_rs_info->csi_rs_estimated_A_MF[1][0][k],
-               &nr_csi_rs_info->csi_rs_estimated_A_MF[1][1][k],
-               &nr_csi_rs_info->csi_rs_estimated_determ_fin[k],
-               1,
-               0,
-               0);
+    nr_det_A_MF_2x2(&nr_csi_rs_info->csi_rs_estimated_A_MF[0][0][k],
+                    &nr_csi_rs_info->csi_rs_estimated_A_MF[0][1][k],
+                    &nr_csi_rs_info->csi_rs_estimated_A_MF[1][0][k],
+                    &nr_csi_rs_info->csi_rs_estimated_A_MF[1][1][k],
+                    &nr_csi_rs_info->csi_rs_estimated_determ_fin[k],
+                    1);
 
     // compute the square of A_MF (numerator)
-    squared_matrix_element(&nr_csi_rs_info->csi_rs_estimated_A_MF[0][0][k], &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[0][0][k], 1);
-    squared_matrix_element(&nr_csi_rs_info->csi_rs_estimated_A_MF[0][1][k], &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[0][1][k], 1);
-    squared_matrix_element(&nr_csi_rs_info->csi_rs_estimated_A_MF[1][0][k], &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[1][0][k], 1);
-    squared_matrix_element(&nr_csi_rs_info->csi_rs_estimated_A_MF[1][1][k], &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[1][1][k], 1);
-    numer(&nr_csi_rs_info->csi_rs_estimated_A_MF_sq[0][0][k],
-          &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[0][1][k],
-          &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[1][0][k],
-          &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[1][1][k],
-          &nr_csi_rs_info->csi_rs_estimated_numer_fin[k],
-          1);
+    nr_squared_matrix_element(&nr_csi_rs_info->csi_rs_estimated_A_MF[0][0][k], &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[0][0][k], 1);
+    nr_squared_matrix_element(&nr_csi_rs_info->csi_rs_estimated_A_MF[0][1][k], &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[0][1][k], 1);
+    nr_squared_matrix_element(&nr_csi_rs_info->csi_rs_estimated_A_MF[1][0][k], &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[1][0][k], 1);
+    nr_squared_matrix_element(&nr_csi_rs_info->csi_rs_estimated_A_MF[1][1][k], &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[1][1][k], 1);
+    nr_numer_2x2(&nr_csi_rs_info->csi_rs_estimated_A_MF_sq[0][0][k],
+                 &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[0][1][k],
+                 &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[1][0][k],
+                 &nr_csi_rs_info->csi_rs_estimated_A_MF_sq[1][1][k],
+                 &nr_csi_rs_info->csi_rs_estimated_numer_fin[k],
+                 1);
 
     // compute the conditional number
     for (int sc_idx=0; sc_idx < NR_NB_SC_PER_RB; sc_idx++) {
diff --git a/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_demodulation.c b/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_demodulation.c
index 69eccaf02e7c541c826a1d99a5c1c4a0faa97d68..07e7e5c3bbe6b7b0c36bb1626676880a35670323 100644
--- a/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_demodulation.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/nr_dlsch_demodulation.c
@@ -1869,6 +1869,89 @@ void nr_dlsch_detection_mrc(int **rxdataF_comp,
 #endif
 }
 
+void nr_det_A_MF_2x2(int32_t *a_mf_00, // in : element a (row 0, col 0); each int32_t is handled as two packed int16_t halves (I low, Q high per the comments below)
+                     int32_t *a_mf_01, // in : element b (row 0, col 1)
+                     int32_t *a_mf_10, // in : element c (row 1, col 0)
+                     int32_t *a_mf_11, // in : element d (row 1, col 1)
+                     int32_t *det_fin, // out: |Re(a*d) - Re(b*c)|, one 32-bit value per input element
+                     unsigned short nb_rb) { // the loop handles 3*nb_rb vectors, i.e. 12*nb_rb int32_t per buffer
+  // Writes |Re(a*d) - Re(b*c)| for every element of the 2x2 matrix [a b; c d], i.e. the magnitude of the real part of its determinant.
+  int16_t nr_conjug2[8]__attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1} ; // mask for _mm_sign_epi16: keeps the low 16-bit half of each int32_t, negates the high half
+
+  __m128i ad_re_128, bc_re_128, det_re_128;
+  // Reinterpret the buffers as 128-bit vectors (4 int32_t each); they are dereferenced directly, so callers must pass 16-byte aligned addresses.
+  __m128i *a_mf_00_128 = (__m128i *)a_mf_00;
+  __m128i *a_mf_01_128 = (__m128i *)a_mf_01;
+  __m128i *a_mf_10_128 = (__m128i *)a_mf_10;
+  __m128i *a_mf_11_128 = (__m128i *)a_mf_11;
+  __m128i *det_fin_128 = (__m128i *)det_fin;
+
+  for (int rb = 0; rb<3*nb_rb; rb++) { // rb counts 128-bit vectors here, not resource blocks
+
+    //complex multiplication (I_a+jQ_a)(I_d+jQ_d) = (I_aI_d - Q_aQ_d) + j(Q_aI_d + I_aQ_d)
+    //The imaginary part is often zero, so only the real part is computed
+    ad_re_128 = _mm_sign_epi16(a_mf_00_128[0],*(__m128i*)&nr_conjug2[0]); // (I_a, -Q_a)
+    ad_re_128 = _mm_madd_epi16(ad_re_128,a_mf_11_128[0]); //Re: I_a0*I_d0 - Q_a1*Q_d1
+
+    //complex multiplication (I_b+jQ_b)(I_c+jQ_c) = (I_bI_c - Q_bQ_c) + j(Q_bI_c + I_bQ_c)
+    //The imaginary part is often zero, so only the real part is computed
+    bc_re_128 = _mm_sign_epi16(a_mf_01_128[0],*(__m128i*)&nr_conjug2[0]); // (I_b, -Q_b)
+    bc_re_128 = _mm_madd_epi16(bc_re_128,a_mf_10_128[0]); //Re: I_b0*I_c0 - Q_b1*Q_c1
+
+    det_re_128 = _mm_sub_epi32(ad_re_128, bc_re_128); // Re(a*d) - Re(b*c), 32-bit per element
+
+    //store the absolute value; det in Q30 format (NOTE(review): presumably for Q15 inputs -- confirm)
+    det_fin_128[0] = _mm_abs_epi32(det_re_128);
+    // move every pointer on to the next 128-bit vector
+    det_fin_128+=1;
+    a_mf_00_128+=1;
+    a_mf_01_128+=1;
+    a_mf_10_128+=1;
+    a_mf_11_128+=1;
+  }
+  _mm_empty();
+  _m_empty(); // NOTE(review): _m_empty looks like an alias of _mm_empty (both EMMS) and no MMX registers are used above, so the pair seems redundant -- confirm before removing
+}
+// Squared magnitude of each matrix element: every int32_t is read as two packed int16_t halves and replaced in a_sq by lo*lo + hi*hi.
+void nr_squared_matrix_element(int32_t *a, // in : packed elements, dereferenced as __m128i (16-byte aligned addresses expected)
+                               int32_t *a_sq, // out: one 32-bit sum of squares per input element
+                               unsigned short nb_rb) { // the loop handles 3*nb_rb vectors, i.e. 12*nb_rb int32_t per buffer
+  __m128i *a_128 = (__m128i *)a;
+  __m128i *a_sq_128 = (__m128i *)a_sq;
+  for (int rb=0; rb<3*nb_rb; rb++) { // rb counts 128-bit vectors (4 elements each)
+    a_sq_128[0] = _mm_madd_epi16(a_128[0], a_128[0]); // multiply the 16-bit halves by themselves and add each pair
+    a_sq_128+=1;
+    a_128+=1;
+  }
+  _mm_empty();
+  _m_empty(); // NOTE(review): looks redundant with the _mm_empty call above (both EMMS, no MMX use here) -- confirm before removing
+}
+// Element-wise 32-bit sum of the four squared matrix entries: num_fin = a_00_sq + a_01_sq + a_10_sq + a_11_sq.
+void nr_numer_2x2(int32_t *a_00_sq, // in : squared entry (0,0), dereferenced as __m128i (16-byte aligned addresses expected)
+                  int32_t *a_01_sq, // in : squared entry (0,1)
+                  int32_t *a_10_sq, // in : squared entry (1,0)
+                  int32_t *a_11_sq, // in : squared entry (1,1)
+                  int32_t *num_fin, // out: sum of the four inputs, one int32_t per element
+                  unsigned short nb_rb) { // the loop handles 3*nb_rb vectors, i.e. 12*nb_rb int32_t per buffer
+  __m128i *a_00_sq_128 = (__m128i *)a_00_sq;
+  __m128i *a_01_sq_128 = (__m128i *)a_01_sq;
+  __m128i *a_10_sq_128 = (__m128i *)a_10_sq;
+  __m128i *a_11_sq_128 = (__m128i *)a_11_sq;
+  __m128i *num_fin_128 = (__m128i *)num_fin;
+  for (int rb=0; rb<3*nb_rb; rb++) { // rb counts 128-bit vectors (4 elements each)
+    __m128i sq_a_plus_sq_d_128 = _mm_add_epi32(a_00_sq_128[0], a_11_sq_128[0]); // diagonal entries
+    __m128i sq_b_plus_sq_c_128 = _mm_add_epi32(a_01_sq_128[0], a_10_sq_128[0]); // off-diagonal entries
+    num_fin_128[0] = _mm_add_epi32(sq_a_plus_sq_d_128, sq_b_plus_sq_c_128); // _mm_add_epi32 wraps on overflow, it does not saturate
+    num_fin_128+=1;
+    a_00_sq_128+=1;
+    a_01_sq_128+=1;
+    a_10_sq_128+=1;
+    a_11_sq_128+=1;
+  }
+  _mm_empty();
+  _m_empty(); // NOTE(review): looks redundant with the _mm_empty call above (both EMMS, no MMX use here) -- confirm before removing
+}
+
 /* Zero Forcing Rx function: nr_a_sum_b()
  * Compute the complex addition x=x+y
  *
diff --git a/openair1/PHY/NR_UE_TRANSPORT/nr_transport_proto_ue.h b/openair1/PHY/NR_UE_TRANSPORT/nr_transport_proto_ue.h
index e1df279abf47f7c5935bcef0e0a7d3e102e23419..c2bf41c92321711a462e5dfea9e36b81efb8f9ce 100644
--- a/openair1/PHY/NR_UE_TRANSPORT/nr_transport_proto_ue.h
+++ b/openair1/PHY/NR_UE_TRANSPORT/nr_transport_proto_ue.h
@@ -851,10 +851,6 @@ void construct_HhH_elements(int *ch0conj_ch0,
                          int32_t *after_mf_11,
                          unsigned short nb_rb);
 
-void squared_matrix_element(int32_t *Hh_h_00,
-                            int32_t *Hh_h_00_sq,
-                            unsigned short nb_rb);
-
 void dlsch_channel_level_TM34_meas(int *ch00,
                                    int *ch01,
                                    int *ch10,
@@ -881,19 +877,33 @@ void nr_dlsch_detection_mrc(int **rxdataF_comp,
                             unsigned short nb_rb,
                             int length);
 
-void det_HhH(int32_t *after_mf_00,
-             int32_t *after_mf_01,
-             int32_t *after_mf_10,
-             int32_t *after_mf_11,
-             int32_t *det_fin_128,
-             unsigned short nb_rb);
-
-void numer(int32_t *Hh_h_00_sq,
-           int32_t *Hh_h_01_sq,
-           int32_t *Hh_h_10_sq,
-           int32_t *Hh_h_11_sq,
-           int32_t *num_fin,
-           unsigned short nb_rb);
+void nr_conjch0_mult_ch1(int *ch0, // first channel operand; presumably the one that gets conjugated (name suggests conj(ch0)*ch1) -- TODO confirm against the definition
+                         int *ch1, // second channel operand
+                         int32_t *ch0conj_ch1, // output buffer receiving the product
+                         unsigned short nb_rb, // presumably a resource-block count; the CSI-RS caller passes 1 -- TODO confirm
+                         unsigned char output_shift0); // presumably a shift applied to the result; the CSI-RS caller passes 0 -- TODO confirm
+// Complex addition x = x + y: input_x is both operand and destination (the CSI-RS code uses it to accumulate into A_MF).
+void nr_a_sum_b(__m128i *input_x, // in/out: accumulator
+                __m128i *input_y, // in : values added to input_x
+                unsigned short nb_rb); // presumably a resource-block count; the CSI-RS caller passes 1 -- TODO confirm
+// Writes |Re(a*d) - Re(b*c)| per element for the 2x2 matrix [a_mf_00 a_mf_01; a_mf_10 a_mf_11]; buffers are accessed as 128-bit vectors.
+void nr_det_A_MF_2x2(int32_t *a_mf_00, // in : entry (0,0), two packed int16_t halves per int32_t
+                     int32_t *a_mf_01, // in : entry (0,1)
+                     int32_t *a_mf_10, // in : entry (1,0)
+                     int32_t *a_mf_11, // in : entry (1,1)
+                     int32_t *det_fin, // out: absolute value of the real part of the determinant, 32-bit per element
+                     unsigned short nb_rb); // 12*nb_rb int32_t elements are processed per buffer
+// For every int32_t of a (two packed int16_t halves) writes lo*lo + hi*hi into a_sq; buffers are accessed as 128-bit vectors.
+void nr_squared_matrix_element(int32_t *a, // in : packed elements
+                               int32_t *a_sq, // out: 32-bit sum of squares per element
+                               unsigned short nb_rb); // 12*nb_rb int32_t elements are processed per buffer
+// Element-wise 32-bit sum num_fin = a_00_sq + a_01_sq + a_10_sq + a_11_sq; buffers are accessed as 128-bit vectors.
+void nr_numer_2x2(int32_t *a_00_sq, // in : squared entry (0,0)
+                  int32_t *a_01_sq, // in : squared entry (0,1)
+                  int32_t *a_10_sq, // in : squared entry (1,0)
+                  int32_t *a_11_sq, // in : squared entry (1,1)
+                  int32_t *num_fin, // out: sum of the four inputs per element
+                  unsigned short nb_rb); // 12*nb_rb int32_t elements are processed per buffer
 
 uint8_t rank_estimation_tm3_tm4(int *dl_ch_estimates_00,
                                 int *dl_ch_estimates_01,