Remove SIMD operation for DMRS amplitude scaling

There was concern that someone in the future might call the function with the wrong inputs. So, based on the discussion in the MR at https://gitlab.eurecom.fr/oai/openairinterface5g/-/merge_requests/2916#note_132506, the SIMD code for multiplying the DMRS vector by the amplitude is removed, and the multiplication is now done in plain C (no SIMD). The execution time of the function dmrs_amp_mult() in this commit is 10x higher than in the previous commit.

Remove SIMD operation for DMRS amplitude scaling
There was concern that someone in the future might call the function with the wrong inputs. So, based on the discussion in the MR at https://gitlab.eurecom.fr/oai/openairinterface5g/-/merge_requests/2916#note_132506, the SIMD code for multiplying the DMRS vector by the amplitude is removed, and the multiplication is now done in plain C (no SIMD). The execution time of the function dmrs_amp_mult() in this commit is 10x higher than in the previous commit.
26b3fcca · Sakthivel Velumani · francescomani · 393363e2 · 26b3fcca · 26b3fcca
Commit 26b3fcca authored Aug 29, 2024 by Sakthivel Velumani Committed by francescomani Sep 03, 2024
3 changed files
--- a/openair1/PHY/NR_UE_TRANSPORT/nr_ulsch_ue.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/nr_ulsch_ue.c
@@ -386,18 +386,17 @@ static void dmrs_amp_mult(const uint32_t dmrs_port,
                          const uint32_t n_dmrs,
                          const pusch_dmrs_type_t dmrs_type)
 {
-  /*
-  A short array that holds amplitude values used for
-  multiplying with the modulated DMRS vector in 128bit SIMD.
-  */
-  int16_t alpha_dmrs[8] __attribute((aligned(16)));
-  for (int_fast8_t i = 0; i < sizeofArray(alpha_dmrs) / 2; i++) {
-    const int16_t a = Wf[i % 2] * Wt * AMP;
-    alpha_dmrs[2 * i]     = a; // multiplier for real part of DMRS symbol
-    alpha_dmrs[2 * i + 1] = a; // multiplier for img part of DMRS symbol
+  /* short array that hold amplitude for k_prime = 0 and k_prime = 1 */
+  int32_t alpha_dmrs[2] __attribute((aligned(16)));
+  for (int_fast8_t i = 0; i < sizeofArray(alpha_dmrs); i++) {
+    const int32_t a = Wf[i] * Wt * AMP;
+    alpha_dmrs[i] = a;
+  }
+
+  /* multiply amplitude with complex DMRS vector */
+  for (int_fast16_t i = 0; i < n_dmrs; i++) {
+    mod_dmrs_out[i] = c16mulRealShift(mod_dmrs[i], alpha_dmrs[i % 2], 15);
  }
-  /* multiply mod_dmrs with alpha_dmrs in 4 symbol patches */
-  mult_real_vector_single_vector(mod_dmrs, alpha_dmrs, mod_dmrs_out, n_dmrs);
 }

 /*

--- a/openair1/PHY/TOOLS/cmult_sv.c
+++ b/openair1/PHY/TOOLS/cmult_sv.c
@@ -24,18 +24,6 @@
 #include <simde/simde-common.h>
 #include <simde/x86/sse.h>

-void mult_real_vector_single_vector(const c16_t *x, const int16_t *alpha, c16_t *y, const unsigned int N)
-{
-  const simd_q15_t *alpha_128 = (const simd_q15_t *)alpha;
-  const simd_q15_t *x_128 = (const simd_q15_t *)x;
-  simd_q15_t *y_128 = (simd_q15_t *)y;
-  const unsigned int num_adds = (N + 3) / 4; // ceil(N/4)
-
-  for (uint_fast32_t n = 0; n < num_adds; n++) {
-    y_128[n] = mulhi_int16(x_128[n], *alpha_128);
-  }
-}
-
 void multadd_complex_vector_real_scalar(int16_t *x,
                                        int16_t alpha,
                                        int16_t *y,

--- a/openair1/PHY/TOOLS/tools_defs.h
+++ b/openair1/PHY/TOOLS/tools_defs.h
@@ -824,8 +824,6 @@ double interp(double x, double *xs, double *ys, int count);
 void simde_mm128_separate_real_imag_parts(simde__m128i *out_re, simde__m128i *out_im, simde__m128i in0, simde__m128i in1);
 void simde_mm256_separate_real_imag_parts(simde__m256i *out_re, simde__m256i *out_im, simde__m256i in0, simde__m256i in1);

-void mult_real_vector_single_vector(const c16_t *x, const int16_t *alpha, c16_t *y, const unsigned int N);
-
 #ifdef __cplusplus
 }
 #endif