Merge remote-tracking branch 'origin/better-dot_product' into integration_2022_wk48

21049fa1 · Robert Schmidt · 0f17e739 · d74b2bd1 · 21049fa1 · 21049fa1
Commit 21049fa1 authored Nov 30, 2022 by Robert Schmidt
6 changed files
--- a/openair1/PHY/LTE_ESTIMATION/lte_est_freq_offset.c
+++ b/openair1/PHY/LTE_ESTIMATION/lte_est_freq_offset.c
@@ -35,8 +35,7 @@ int32x4_t avg128F;
 #endif

 //compute average channel_level on each (TX,RX) antenna pair
-int dl_channel_level(int16_t *dl_ch,
-                     LTE_DL_FRAME_PARMS *frame_parms)
+int dl_channel_level(c16_t *dl_ch, LTE_DL_FRAME_PARMS *frame_parms)
 {

  int16_t rb;
@@ -100,13 +99,9 @@ int lte_est_freq_offset(int **dl_ch_estimates,
                        int* freq_offset,
 			int reset)
 {
-
-  int ch_offset, omega, dl_ch_shift;
-  c16_t omega_cpx;
+  int ch_offset;
  double phase_offset;
  int freq_offset_est;
-  unsigned char aa;
-  int16_t *dl_ch,*dl_ch_prev;
  static int first_run = 1;
  int coef = 1<<10;
  int ncoef =  32767 - coef;
@@ -124,49 +119,40 @@ int lte_est_freq_offset(int **dl_ch_estimates,

  phase_offset = 0.0;

-  //  for (aa=0;aa<frame_parms->nb_antennas_rx*frame_parms->nb_antennas_tx;aa++) {
-  for (aa=0; aa<1; aa++) {
-
-    dl_ch = (int16_t *)&dl_ch_estimates[aa][12+ch_offset];
+  // Warning: only one antenna used
+  for (int aa = 0; aa < 1; aa++) {
+    c16_t *dl_ch = (c16_t *)&dl_ch_estimates[aa][12 + ch_offset];

-    dl_ch_shift = 6+(log2_approx(dl_channel_level(dl_ch,frame_parms))/2);
-    //    printf("dl_ch_shift: %d\n",dl_ch_shift);
+    int dl_ch_shift = 6 + (log2_approx(dl_channel_level(dl_ch, frame_parms)) / 2);

+    c16_t *dl_ch_prev;
    if (ch_offset == 0)
-      dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][12+(4-frame_parms->Ncp)*(frame_parms->ofdm_symbol_size)];
+      dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][12 + (4 - frame_parms->Ncp) * (frame_parms->ofdm_symbol_size)];
    else
-      dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][12+0];
-
+      dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][12 + 0];

    // calculate omega = angle(conj(dl_ch)*dl_ch_prev))
    //    printf("Computing freq_offset\n");
-    omega = dot_product(dl_ch,dl_ch_prev,(frame_parms->N_RB_DL/2 - 1)*12,dl_ch_shift);
-    //omega = dot_product(dl_ch,dl_ch_prev,frame_parms->ofdm_symbol_size,15);
-    omega_cpx.r = ((c16_t*) &omega)->r;
-    omega_cpx.i = ((c16_t*) &omega)->i;
+    c32_t omega_tmp = dot_product(dl_ch, dl_ch_prev, (frame_parms->N_RB_DL / 2 - 1) * 12, dl_ch_shift);
+    cd_t omega_cpx = {omega_tmp.r, omega_tmp.i};
+    // omega = dot_product(dl_ch,dl_ch_prev,frame_parms->ofdm_symbol_size,15);

-    dl_ch = (int16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL/2) + 1)*12) + ch_offset];
+    dl_ch = (c16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL / 2) + 1) * 12) + ch_offset];

    if (ch_offset == 0)
-      dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL/2) + 1)*12)+(4-frame_parms->Ncp)*(frame_parms->ofdm_symbol_size)];
+      dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL / 2) + 1) * 12) + (4 - frame_parms->Ncp) * (frame_parms->ofdm_symbol_size)];
    else
-      dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][((frame_parms->N_RB_DL/2) + 1)*12];
+      dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][((frame_parms->N_RB_DL / 2) + 1) * 12];

    // calculate omega = angle(conj(dl_ch)*dl_ch_prev))
-    omega = dot_product(dl_ch,dl_ch_prev,((frame_parms->N_RB_DL/2) - 1)*12,dl_ch_shift);
-
-    omega_cpx.r += ((c16_t*) &omega)->r;
-    omega_cpx.i += ((c16_t*) &omega)->i;
+    c32_t omega = dot_product(dl_ch, dl_ch_prev, ((frame_parms->N_RB_DL / 2) - 1) * 12, dl_ch_shift);

-    //    phase_offset += atan2((double)omega_cpx->i,(double)omega_cpx->r);
-    phase_offset += atan2((double)omega_cpx.i,(double)omega_cpx.r);
-    //    LOG_I(PHY,"omega (%d,%d) -> %f\n",omega_cpx->r,omega_cpx->i,phase_offset);
+    omega_cpx.r += omega.r;
+    omega_cpx.i += omega.i;
+    phase_offset += atan2(omega_cpx.i, omega_cpx.r);
  }

-  //  phase_offset /= (frame_parms->nb_antennas_rx*frame_parms->nb_antennas_tx);
-
  freq_offset_est = (int) (phase_offset/(2*M_PI)/(frame_parms->Ncp==NORMAL ? (285.8e-6):(2.5e-4))); //2.5e-4 is the time between pilot symbols
-  //  LOG_I(PHY,"symbol %d : freq_offset_est %d\n",l,freq_offset_est);

  // update freq_offset with phase_offset using a moving average filter
  if (first_run == 1) {
@@ -175,14 +161,6 @@ int lte_est_freq_offset(int **dl_ch_estimates,
  } else
    *freq_offset = ((freq_offset_est * coef) + (*freq_offset * ncoef)) >> 15;

-  //#ifdef DEBUG_PHY
-  //    msg("l=%d, phase_offset = %f (%d,%d), freq_offset_est = %d Hz, freq_offset_filt = %d \n",l,phase_offset,omega_cpx->r,omega_cpx->i,freq_offset_est,*freq_offset);
-  /*
-  for (i=0;i<150;i++)
-    msg("i %d : %d,%d <=> %d,%d\n",i,dl_ch[2*i],dl_ch[(2*i)+1],dl_ch_prev[2*i],dl_ch_prev[(2*i)+1]);
-  */
-  //#endif
-
  return(0);
 }
 //******************************************************************************************************
@@ -193,13 +171,9 @@ int lte_mbsfn_est_freq_offset(int **dl_ch_estimates,
                              int l,
                              int* freq_offset)
 {
-
-  int ch_offset, omega, dl_ch_shift;
-  c16_t *omega_cpx;
+  int ch_offset;
  double phase_offset;
  int freq_offset_est;
-  unsigned char aa;
-  int16_t *dl_ch,*dl_ch_prev;
  static int first_run = 1;
  int coef = 1<<10;
  int ncoef =  32767 - coef;
@@ -213,52 +187,38 @@ int lte_mbsfn_est_freq_offset(int **dl_ch_estimates,
  }

  phase_offset = 0.0;
+  
+  // Warning: only one antenna used
+  for (int aa = 0; aa < 1; aa++) {
+    c16_t *dl_ch_prev;
+    c16_t *dl_ch = (c16_t *)&dl_ch_estimates[aa][12 + ch_offset];

-  //  for (aa=0;aa<frame_parms->nb_antennas_rx*frame_parms->nb_antennas_tx;aa++) {
-  for (aa=0; aa<1; aa++) {
-
-    dl_ch = (int16_t *)&dl_ch_estimates[aa][12+ch_offset];
-
-    dl_ch_shift = 4+(log2_approx(dl_channel_level(dl_ch,frame_parms))/2);
+    int dl_ch_shift = 4 + (log2_approx(dl_channel_level(dl_ch, frame_parms)) / 2);
    //    printf("dl_ch_shift: %d\n",dl_ch_shift);

    if (ch_offset == 0)
-      dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][12+(10*(frame_parms->ofdm_symbol_size))];
+      dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][12 + (10 * (frame_parms->ofdm_symbol_size))];
    else
-      dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][12+6];
-
-    //else if
-    // dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][12+0];
-
-    // calculate omega = angle(conj(dl_ch)*dl_ch_prev))
-    //    printf("Computing freq_offset\n");
-    omega = dot_product(dl_ch,dl_ch_prev,(frame_parms->N_RB_DL/2 - 1)*12,dl_ch_shift);
-    //omega = dot_product(dl_ch,dl_ch_prev,frame_parms->ofdm_symbol_size,15);
-    omega_cpx = (c16_t*) &omega;
+      dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][12 + 6];

-    //    printf("omega (%d,%d)\n",omega_cpx->r,omega_cpx->i);
+    const c32_t omega_tmp = dot_product(dl_ch, dl_ch_prev, (frame_parms->N_RB_DL / 2 - 1) * 12, dl_ch_shift);
+    cd_t omega_cpx = {omega_tmp.r, omega_tmp.i};

-    dl_ch = (int16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL/2) + 1)*12) + ch_offset];
+    dl_ch = (c16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL / 2) + 1) * 12) + ch_offset];

    if (ch_offset == 0)
-      dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL/2) + 1)*12)+10*(frame_parms->ofdm_symbol_size)];
+      dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL / 2) + 1) * 12) + 10 * (frame_parms->ofdm_symbol_size)];
    else
-      dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][((frame_parms->N_RB_DL/2) + 1)*12+6];
+      dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][((frame_parms->N_RB_DL / 2) + 1) * 12 + 6];

    // calculate omega = angle(conj(dl_ch)*dl_ch_prev))
-    omega = dot_product(dl_ch,dl_ch_prev,((frame_parms->N_RB_DL/2) - 1)*12,dl_ch_shift);
-    omega_cpx->r += ((c16_t*) &omega)->r;
-    omega_cpx->i += ((c16_t*) &omega)->i;
-    //    phase_offset += atan2((double)omega_cpx->i,(double)omega_cpx->r);
-    phase_offset += atan2((double)omega_cpx->i,(double)omega_cpx->r);
-    //    printf("omega (%d,%d) -> %f\n",omega_cpx->r,omega_cpx->i,phase_offset);
+    c32_t omega2 = dot_product(dl_ch, dl_ch_prev, ((frame_parms->N_RB_DL / 2) - 1) * 12, dl_ch_shift);
+    omega_cpx.r += omega2.r;
+    omega_cpx.i += omega2.i;
+    phase_offset += atan2(omega_cpx.i, omega_cpx.r);
  }

-  //  phase_offset /= (frame_parms->nb_antennas_rx*frame_parms->nb_antennas_tx);
-
  freq_offset_est = (int) (phase_offset/(2*M_PI)/2.5e-4); //2.5e-4 is the time between pilot symbols
-  //  printf("symbol %d : freq_offset_est %d\n",l,freq_offset_est);
-
  // update freq_offset with phase_offset using a moving average filter
  if (first_run == 1) {
    *freq_offset = freq_offset_est;
@@ -266,13 +226,5 @@ int lte_mbsfn_est_freq_offset(int **dl_ch_estimates,
  } else
    *freq_offset = ((freq_offset_est * coef) + (*freq_offset * ncoef)) >> 15;

-  //#ifdef DEBUG_PHY
-  //    msg("l=%d, phase_offset = %f (%d,%d), freq_offset_est = %d Hz, freq_offset_filt = %d \n",l,phase_offset,omega_cpx->r,omega_cpx->i,freq_offset_est,*freq_offset);
-  /*
-  for (i=0;i<150;i++)
-    msg("i %d : %d,%d <=> %d,%d\n",i,dl_ch[2*i],dl_ch[(2*i)+1],dl_ch_prev[2*i],dl_ch_prev[(2*i)+1]);
-  */
-  //#endif
-
  return(0);
 }
--- a/openair1/PHY/LTE_ESTIMATION/lte_estimation.h
+++ b/openair1/PHY/LTE_ESTIMATION/lte_estimation.h
@@ -75,28 +75,6 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,
                       int band,
                       unsigned int DL_freq);

-
-/*!
-\brief This function performs detection of the PRACH (=SRS) at the eNb to estimate the timing advance
-The algorithm uses a time domain correlation with a downsampled version of the received signal.
-\param rxdata Received time domain data for all rx antennas
-\param frame_parms LTE DL frame parameter structure
-\param length Length for correlation
-\param peak_val pointer to value of returned peak
-\param sync_corr_eNb pointer to correlation buffer
-\return sync_pos Position of the sync within the frame (downsampled) if successfull and -1 if there was an error or no peak was detected.
- */
-int lte_sync_time_eNB(int32_t **rxdata,
-                      LTE_DL_FRAME_PARMS *frame_parms,
-                      uint32_t length,
-                      uint32_t *peak_val,
-                      uint32_t *sync_corr_eNb);
-
-int lte_sync_time_eNB_emul(PHY_VARS_eNB *phy_vars_eNb,
-                           uint8_t sect_id,
-                           int32_t *sync_val);
-
-
 int ru_sync_time_init(RU_t *ru);

 int ru_sync_time(RU_t *ru,

--- a/openair1/PHY/LTE_ESTIMATION/lte_sync_time.c
+++ b/openair1/PHY/LTE_ESTIMATION/lte_sync_time.c
@@ -34,9 +34,7 @@
 // Note: this is for prototype of generate_drs_pusch (OTA synchronization of RRUs)
 #include "PHY/LTE_UE_TRANSPORT/transport_proto_ue.h"

-static c16_t *primary_synch0_time __attribute__((aligned(32)));
-static c16_t *primary_synch1_time __attribute__((aligned(32)));
-static c16_t *primary_synch2_time __attribute__((aligned(32)));
+static c16_t *primary_synch_time[3] __attribute__((aligned(32)));

 static void doIdft(int size, short *in, short *out) {
  switch (size) {
@@ -84,25 +82,23 @@ static void copyPrimary( c16_t *out, struct complex16 *in, int ofdmSize) {

 int lte_sync_time_init(LTE_DL_FRAME_PARMS *frame_parms ) { // LTE_UE_COMMON *common_vars
  c16_t syncF_tmp[2048]__attribute__((aligned(32)))= {{0}};
-  int sz=frame_parms->ofdm_symbol_size*sizeof(*primary_synch0_time);
-  AssertFatal( NULL != (primary_synch0_time = (c16_t *)malloc16(sz)),"");
-  bzero(primary_synch0_time,sz);
-  AssertFatal( NULL != (primary_synch1_time = (c16_t *)malloc16(sz)),"");
-  bzero(primary_synch1_time,sz);
-  AssertFatal( NULL != (primary_synch2_time = (c16_t *)malloc16(sz)),"");
-  bzero(primary_synch2_time,sz);
+  int sz = frame_parms->ofdm_symbol_size * sizeof(**primary_synch_time);
+  for (int i = 0; i < sizeofArray(primary_synch_time); i++) {
+    primary_synch_time[i] = (c16_t *)malloc16_clear(sz);
+    AssertFatal(primary_synch_time[i],"no memory\n");
+  }
  // generate oversampled sync_time sequences
  copyPrimary( syncF_tmp, (c16_t *) primary_synch0, frame_parms->ofdm_symbol_size);
-  doIdft(frame_parms->N_RB_DL, (short *)syncF_tmp,(short *)primary_synch0_time);
+  doIdft(frame_parms->N_RB_DL, (short *)syncF_tmp, (short *)primary_synch_time[0]);
  copyPrimary( syncF_tmp, (c16_t *) primary_synch1, frame_parms->ofdm_symbol_size);
-  doIdft(frame_parms->N_RB_DL, (short *)syncF_tmp,(short *)primary_synch1_time);
+  doIdft(frame_parms->N_RB_DL, (short *)syncF_tmp, (short *)primary_synch_time[1]);
  copyPrimary( syncF_tmp, (c16_t *) primary_synch2, frame_parms->ofdm_symbol_size);
-  doIdft(frame_parms->N_RB_DL, (short *)syncF_tmp,(short *)primary_synch2_time);
+  doIdft(frame_parms->N_RB_DL, (short *)syncF_tmp, (short *)primary_synch_time[2]);

  if ( LOG_DUMPFLAG(DEBUG_LTEESTIM)){
-    LOG_M("primary_sync0.m","psync0",primary_synch0_time,frame_parms->ofdm_symbol_size,1,1);
-    LOG_M("primary_sync1.m","psync1",primary_synch1_time,frame_parms->ofdm_symbol_size,1,1);
-    LOG_M("primary_sync2.m","psync2",primary_synch2_time,frame_parms->ofdm_symbol_size,1,1);
+    LOG_M("primary_sync0.m", "psync0", primary_synch_time[0], frame_parms->ofdm_symbol_size, 1, 1);
+    LOG_M("primary_sync1.m", "psync1", primary_synch_time[1], frame_parms->ofdm_symbol_size, 1, 1);
+    LOG_M("primary_sync2.m", "psync2", primary_synch_time[2], frame_parms->ofdm_symbol_size, 1, 1);
  }

  return (1);
@@ -110,24 +106,11 @@ int lte_sync_time_init(LTE_DL_FRAME_PARMS *frame_parms ) { // LTE_UE_COMMON *com


 void lte_sync_time_free(void) {
-  if (primary_synch0_time) {
-    LOG_D(PHY,"Freeing primary_sync0_time ...\n");
-    free(primary_synch0_time);
-  }
-
-  if (primary_synch1_time) {
-    LOG_D(PHY,"Freeing primary_sync1_time ...\n");
-    free(primary_synch1_time);
-  }
-
-  if (primary_synch2_time) {
-    LOG_D(PHY,"Freeing primary_sync2_time ...\n");
-    free(primary_synch2_time);
-  }
-
-  primary_synch0_time = NULL;
-  primary_synch1_time = NULL;
-  primary_synch2_time = NULL;
+  for (int i = 0; i < sizeofArray(primary_synch_time); i++)
+    if (primary_synch_time[i]) {
+      LOG_D(PHY, "Freeing primary_sync0_time ...\n");
+      free_and_zero(primary_synch_time[i]);
+    }
 }


@@ -141,7 +124,6 @@ int lte_sync_time(int **rxdata, ///rx data in time domain
                  int *eNB_id) {
  // perform a time domain correlation using the oversampled sync sequence
  unsigned int n, ar, s, peak_pos, peak_val, sync_source;
-  int result,result2;
  struct complexd sync_out[3]= {{0}}, sync_out2[3]= {{0}};
  int length =   LTE_NUMBER_OF_SUBFRAMES_PER_FRAME*frame_parms->samples_per_tti>>1;
  peak_val = 0;
@@ -157,32 +139,15 @@ int lte_sync_time(int **rxdata, ///rx data in time domain
    //    if (n<(length-frame_parms->ofdm_symbol_size-frame_parms->nb_prefix_samples)) {
    if (n<(length-frame_parms->ofdm_symbol_size)) {
      //calculate dot product of primary_synch0_time and rxdata[ar][n] (ar=0..nb_ant_rx) and store the sum in temp[n];
-      for (ar=0; ar<frame_parms->nb_antennas_rx; ar++) {
-        result  = dot_product((short *)primary_synch0_time, (short *) &(rxdata[ar][n]), frame_parms->ofdm_symbol_size, SHIFT);
-        result2 = dot_product((short *)primary_synch0_time, (short *) &(rxdata[ar][n+length]), frame_parms->ofdm_symbol_size, SHIFT);
-        sync_out[0].r += ((short *) &result)[0];
-        sync_out[0].i += ((short *) &result)[1];
-        sync_out2[0].r += ((short *) &result2)[0];
-        sync_out2[0].i += ((short *) &result2)[1];
-      }
-
-      for (ar=0; ar<frame_parms->nb_antennas_rx; ar++) {
-        result = dot_product((short *)primary_synch1_time, (short *) &(rxdata[ar][n]), frame_parms->ofdm_symbol_size, SHIFT);
-        result2 = dot_product((short *)primary_synch1_time, (short *) &(rxdata[ar][n+length]), frame_parms->ofdm_symbol_size, SHIFT);
-        sync_out[1].r += ((short *) &result)[0];
-        sync_out[1].i += ((short *) &result)[1];
-        sync_out2[1].r += ((short *) &result2)[0];
-        sync_out2[1].i += ((short *) &result2)[1];
-      }
-
-      for (ar=0; ar<frame_parms->nb_antennas_rx; ar++) {
-        result = dot_product((short *)primary_synch2_time, (short *) &(rxdata[ar][n]), frame_parms->ofdm_symbol_size, SHIFT);
-        result2 = dot_product((short *)primary_synch2_time, (short *) &(rxdata[ar][n+length]), frame_parms->ofdm_symbol_size, SHIFT);
-        sync_out[2].r += ((short *) &result)[0];
-        sync_out[2].i += ((short *) &result)[1];
-        sync_out2[2].r += ((short *) &result2)[0];
-        sync_out2[2].i += ((short *) &result2)[1];
-      }
+      for (int i = 0; i < sizeofArray(primary_synch_time); i++)
+        for (ar = 0; ar < frame_parms->nb_antennas_rx; ar++) {
+          c32_t result = dot_product(primary_synch_time[i], (c16_t *)&(rxdata[ar][n]), frame_parms->ofdm_symbol_size, SHIFT);
+          c32_t result2 = dot_product(primary_synch_time[i], (c16_t *)&(rxdata[ar][n + length]), frame_parms->ofdm_symbol_size, SHIFT);
+          sync_out[i].r += result.r;
+          sync_out[i].i += result.i;
+          sync_out2[i].r += result2.r;
+          sync_out2[i].i += result2.i;
+        }
    }

    // calculate the absolute value of sync_corr[n]
@@ -274,88 +239,6 @@ void ru_sync_time_free(RU_t *ru) {
    free(ru->dmrs_corr);
 }

-//#define DEBUG_PHY
-
-int lte_sync_time_eNB(int32_t **rxdata, ///rx data in time domain
-                      LTE_DL_FRAME_PARMS *frame_parms,
-                      uint32_t length,
-                      uint32_t *peak_val_out,
-                      uint32_t *sync_corr_eNB) {
-  // perform a time domain correlation using the oversampled sync sequence
-  unsigned int n, ar, peak_val, peak_pos;
-  uint64_t mean_val;
-  int result;
-  short *primary_synch_time;
-  int eNB_id = frame_parms->Nid_cell%3;
-
-  // LOG_E(PHY,"[SYNC TIME] Calling sync_time_eNB(%p,%p,%d,%d)\n",rxdata,frame_parms,eNB_id,length);
-  if (sync_corr_eNB == NULL) {
-    LOG_E(PHY,"[SYNC TIME] sync_corr_eNB not yet allocated! Exiting.\n");
-    return(-1);
-  }
-
-  switch (eNB_id) {
-    case 0:
-      primary_synch_time = (short *)primary_synch0_time;
-      break;
-
-    case 1:
-      primary_synch_time = (short *)primary_synch1_time;
-      break;
-
-    case 2:
-      primary_synch_time = (short *)primary_synch2_time;
-      break;
-
-    default:
-      LOG_E(PHY,"[SYNC TIME] Illegal eNB_id!\n");
-      return (-1);
-  }
-
-  peak_val = 0;
-  peak_pos = 0;
-  mean_val = 0;
-
-  for (n=0; n<length; n+=4) {
-    sync_corr_eNB[n] = 0;
-
-    if (n<(length-frame_parms->ofdm_symbol_size-frame_parms->nb_prefix_samples)) {
-      //calculate dot product of primary_synch0_time and rxdata[ar][n] (ar=0..nb_ant_rx) and store the sum in temp[n];
-      for (ar=0; ar<frame_parms->nb_antennas_rx; ar++)  {
-        result = dot_product((short *)primary_synch_time, (short *) &(rxdata[ar][n]), frame_parms->ofdm_symbol_size, SHIFT);
-        //((short*)sync_corr)[2*n]   += ((short*) &result)[0];
-        //((short*)sync_corr)[2*n+1] += ((short*) &result)[1];
-        sync_corr_eNB[n] += squaredMod(*(c16_t*)&result);
-      }
-    }
-
-    /*
-    if (eNB_id == 2) {
-      printf("sync_time_eNB %d : %d,%d (%d)\n",n,sync_corr_eNB[n],mean_val,
-       peak_val);
-    }
-    */
-    mean_val += sync_corr_eNB[n];
-
-    if (sync_corr_eNB[n]>peak_val) {
-      peak_val = sync_corr_eNB[n];
-      peak_pos = n;
-    }
-  }
-
-  mean_val/=length;
-  *peak_val_out = peak_val;
-
-  if (peak_val <= (40*(uint32_t)mean_val)) {
-    LOG_I(PHY,"[SYNC TIME] No peak found (%u,%u,%"PRIu64",%"PRIu64")\n",peak_pos,peak_val,mean_val,40*mean_val);
-    return(-1);
-  } else {
-    LOG_I(PHY,"[SYNC TIME] Peak found at pos %u, val = %u, mean_val = %"PRIu64"\n",peak_pos,peak_val,mean_val);
-    return(peak_pos);
-  }
-}
-
-
 int ru_sync_time(RU_t *ru,
                 int64_t *lev,
                 int64_t *avg) {
@@ -372,8 +255,7 @@ int ru_sync_time(RU_t *ru,

  int32_t maxlev0=0;
  int     maxpos0=0;
-  int64_t avg0=0;
-  int64_t result;
+  int64_t avg0 = 0;
  int64_t dmrs_corr;
  int maxval=0;

@@ -396,19 +278,12 @@ int ru_sync_time(RU_t *ru,

    //calculate dot product of primary_synch0_time and rxdata[ar][n] (ar=0..nb_ant_rx) and store the sum in temp[n];
    for (int ar=0; ar<ru->nb_rx; ar++) {
-      result  = dot_product64(ru->dmrssync,
-                              (int16_t *) &ru->common.rxdata[ar][n],
-                              frame_parms->ofdm_symbol_size,
-                              shift);
-
-      if (ru->state == RU_CHECK_SYNC) {
-        result  = dot_product64((int16_t *) &calibration->drs_ch_estimates_time[ar],
-                                (int16_t *) &ru->common.rxdata[ar][n],
-                                frame_parms->ofdm_symbol_size,
-                                shift);
-      }
-
-      dmrs_corr += squaredMod(*(c32_t*)&result);
+      const c16_t *input = ru->state == RU_CHECK_SYNC ?
+        (c16_t *)&calibration->drs_ch_estimates_time[ar] :
+        (c16_t *)ru->dmrssync;
+      c32_t result = dot_product(input, (c16_t *)&ru->common.rxdata[ar][n], frame_parms->ofdm_symbol_size, shift);
+      const c64_t tmp = {result.r, result.i};
+      dmrs_corr += squaredMod(tmp);
    }

    if (ru->dmrs_corr != NULL)

--- a/openair1/PHY/NR_UE_TRANSPORT/pss_nr.c
+++ b/openair1/PHY/NR_UE_TRANSPORT/pss_nr.c
@@ -651,7 +651,6 @@ int pss_synchro_nr(PHY_VARS_NR_UE *PHY_vars_UE, int is, int rate_change)
 *
 *********************************************************************/

-#define DOT_PRODUCT_SCALING_SHIFT    (17)
 int pss_search_time_nr(int **rxdata, ///rx data in time domain
                       NR_DL_FRAME_PARMS *frame_parms,
 		       int fo_flag,
@@ -694,8 +693,7 @@ int pss_search_time_nr(int **rxdata, ///rx data in time domain
  /* Correlation computation is based on a a dot product which is realized thank to SIMS extensions */

  for (int pss_index = 0; pss_index < NUMBER_PSS_SEQUENCE; pss_index++) {
-
-    for (n=0; n < length; n+=4) { //
+    for (n = 0; n < length; n += 8) { //

      int64_t pss_corr_ue=0;
      /* calculate dot product of primary_synchro_time_nr and rxdata[ar][n]
@@ -703,12 +701,8 @@ int pss_search_time_nr(int **rxdata, ///rx data in time domain
      for (ar=0; ar<frame_parms->nb_antennas_rx; ar++) {

        /* perform correlation of rx data and pss sequence ie it is a dot product */
-        const int64_t result = dot_product64((short *)primary_synchro_time_nr[pss_index],
-                                             (short *)&(rxdata[ar][n + is * frame_parms->samples_per_frame]),
-                                             frame_parms->ofdm_symbol_size,
-                                             shift);
-        const c32_t r32 = *(c32_t*)&result;
-        const c64_t r64 = {.r = r32.r, .i = r32.i};
+        const c32_t result = dot_product((c16_t *)primary_synchro_time_nr[pss_index], (c16_t *)&(rxdata[ar][n + is * frame_parms->samples_per_frame]), frame_parms->ofdm_symbol_size, shift);
+        const c64_t r64 = {.r = result.r, .i = result.i};
        pss_corr_ue += squaredMod(r64);
        //((short*)pss_corr_ue[pss_index])[2*n] += ((short*) &result)[0];   /* real part */
        //((short*)pss_corr_ue[pss_index])[2*n+1] += ((short*) &result)[1]; /* imaginary part */
@@ -736,27 +730,17 @@ int pss_search_time_nr(int **rxdata, ///rx data in time domain
 	  // fractional frequency offset computation according to Cross-correlation Synchronization Algorithm Using PSS
 	  // Shoujun Huang, Yongtao Su, Ying He and Shan Tang, "Joint time and frequency offset estimation in LTE downlink," 7th International Conference on Communications and Networking in China, 2012.

-	  int64_t result1,result2;
-	  // Computing cross-correlation at peak on half the symbol size for first half of data
-	  result1  = dot_product64((short*)primary_synchro_time_nr[pss_source], 
-				  (short*) &(rxdata[0][peak_position+is*frame_parms->samples_per_frame]), 
-				  frame_parms->ofdm_symbol_size>>1, 
-				  shift);
-	  // Computing cross-correlation at peak on half the symbol size for data shifted by half symbol size 
-	  // as it is real and complex it is necessary to shift by a value equal to symbol size to obtain such shift
-	  result2  = dot_product64((short*)primary_synchro_time_nr[pss_source]+(frame_parms->ofdm_symbol_size), 
-				  (short*) &(rxdata[0][peak_position+is*frame_parms->samples_per_frame])+frame_parms->ofdm_symbol_size, 
-				  frame_parms->ofdm_symbol_size>>1, 
-				  shift);
-
-	  int64_t re1,re2,im1,im2;
-	  re1=((int*) &result1)[0];
-	  re2=((int*) &result2)[0];
-	  im1=((int*) &result1)[1];
-	  im2=((int*) &result2)[1];
-
- 	  // estimation of fractional frequency offset: angle[(result1)'*(result2)]/pi
-	  ffo_est=atan2(re1*im2-re2*im1,re1*re2+im1*im2)/M_PI;
+    // Computing cross-correlation at peak on half the symbol size for first half of data
+    c32_t r1 = dot_product((c16_t *)primary_synchro_time_nr[pss_source], (c16_t *)&(rxdata[0][peak_position + is * frame_parms->samples_per_frame]), frame_parms->ofdm_symbol_size >> 1, shift);
+    // Computing cross-correlation at peak on half the symbol size for data shifted by half symbol size
+    // as it is real and complex it is necessary to shift by a value equal to symbol size to obtain such shift
+    c32_t r2 = dot_product((c16_t *)primary_synchro_time_nr[pss_source] + (frame_parms->ofdm_symbol_size >> 1),
+                           (c16_t *)&(rxdata[0][peak_position + is * frame_parms->samples_per_frame]) + (frame_parms->ofdm_symbol_size >> 1),
+                           frame_parms->ofdm_symbol_size >> 1,
+                           shift);
+    cd_t r1d = {r1.r, r1.i}, r2d = {r2.r, r2.i};
+    // estimation of fractional frequency offset: angle[(result1)'*(result2)]/pi
+    ffo_est = atan2(r1d.r * r2d.i - r2d.r * r1d.i, r1d.r * r2d.r + r1d.i * r2d.i) / M_PI;

 #ifdef DBG_PSS_NR
 	  printf("ffo %lf\n",ffo_est);

--- a/openair1/PHY/TOOLS/cdot_prod.c
+++ b/openair1/PHY/TOOLS/cdot_prod.c
@@ -29,256 +29,51 @@ void print_ints(char *s,__m128i *x);
 void print_shorts(char *s,__m128i *x);
 void print_bytes(char *s,__m128i *x);
 #endif
-
-int32_t dot_product(int16_t *x,
-                    int16_t *y,
-                    uint32_t N, //must be a multiple of 8
-                    uint8_t output_shift)
-{
-
-  uint32_t n;
-
-#if defined(__x86_64__) || defined(__i386__)
-  __m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im;
-  __m64 mmtmp7;
-  __m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1);
-  int32_t result;
-
-  x128 = (__m128i*) x;
-  y128 = (__m128i*) y;
-
-  mmcumul_re = _mm_setzero_si128();
-  mmcumul_im = _mm_setzero_si128();
-
-  for (n=0; n<(N>>2); n++) {
-
-    //printf("n=%d, x128=%p, y128=%p\n",n,x128,y128);
-    //    print_shorts("x",&x128[0]);
-    //    print_shorts("y",&y128[0]);
-
-    // this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y)
-    mmtmp1 = _mm_madd_epi16(x128[0],y128[0]);
-    //    print_ints("re",&mmtmp1);
-    // mmtmp1 contains real part of 4 consecutive outputs (32-bit)
-
-    // shift and accumulate results
-    mmtmp1 = _mm_srai_epi32(mmtmp1,output_shift);
-    mmcumul_re = _mm_add_epi32(mmcumul_re,mmtmp1);
-    //    print_ints("re",&mmcumul_re);
-
-
-    // this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x)
-    mmtmp2 = _mm_shufflelo_epi16(y128[0],_MM_SHUFFLE(2,3,0,1));
-    //    print_shorts("y",&mmtmp2);
-    mmtmp2 = _mm_shufflehi_epi16(mmtmp2,_MM_SHUFFLE(2,3,0,1));
-    //    print_shorts("y",&mmtmp2);
-    mmtmp2 = _mm_sign_epi16(mmtmp2,minus_i);
-    //        print_shorts("y",&mmtmp2);
-
-    mmtmp3 = _mm_madd_epi16(x128[0],mmtmp2);
-    //        print_ints("im",&mmtmp3);
-    // mmtmp3 contains imag part of 4 consecutive outputs (32-bit)
-
-    // shift and accumulate results
-    mmtmp3 = _mm_srai_epi32(mmtmp3,output_shift);
-    mmcumul_im = _mm_add_epi32(mmcumul_im,mmtmp3);
-    //    print_ints("im",&mmcumul_im);
-
-    x128++;
-    y128++;
-  }
-
-  // this gives Re Re Im Im
-  mmcumul = _mm_hadd_epi32(mmcumul_re,mmcumul_im);
-  //  print_ints("cumul1",&mmcumul);
-
-  // this gives Re Im Re Im
-  mmcumul = _mm_hadd_epi32(mmcumul,mmcumul);
-
-  //  print_ints("cumul2",&mmcumul);
-
-
-  //mmcumul = _mm_srai_epi32(mmcumul,output_shift);
-  // extract the lower half
-  mmtmp7 = _mm_movepi64_pi64(mmcumul);
-  //  print_ints("mmtmp7",&mmtmp7);
-  // pack the result
-  mmtmp7 = _mm_packs_pi32(mmtmp7,mmtmp7);
-  //  print_shorts("mmtmp7",&mmtmp7);
-  // convert back to integer
-  result = _mm_cvtsi64_si32(mmtmp7);
-  
-  _mm_empty();
-  _m_empty();
- 
-  return(result);
-
-#elif defined(__arm__) || defined(__aarch64__)
-  int16x4_t *x_128=(int16x4_t*)x;
-  int16x4_t *y_128=(int16x4_t*)y;
-  int32x4_t tmp_re,tmp_im;
-  int32x4_t tmp_re1,tmp_im1;
-  int32x4_t re_cumul,im_cumul;
-  int32x2_t re_cumul2,im_cumul2;
-  int32x4_t shift = vdupq_n_s32(-output_shift); 
-  int32x2x2_t result2;
-  int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ;
-
-  re_cumul = vdupq_n_s32(0);
-  im_cumul = vdupq_n_s32(0); 
-
-  for (n=0; n<(N>>2); n++) {
-
-    tmp_re  = vmull_s16(*x_128++, *y_128++);
-    //tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])] 
-    tmp_re1 = vmull_s16(*x_128++, *y_128++);
-    //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])] 
-    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
-                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
-    //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2]) Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] 
-
-    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
-    //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
-    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
-    //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
-    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
-                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
-    //tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]
-
-    re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift));
-    im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift));
-  }
-  
-  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul));
-  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul));
-  re_cumul2 = vpadd_s32(re_cumul2,re_cumul2);
-  im_cumul2 = vpadd_s32(im_cumul2,im_cumul2);
-  result2   = vzip_s32(re_cumul2,im_cumul2);
-  return(vget_lane_s32(result2.val[0],0));
-#endif
-}
-
-
-int64_t dot_product64(int16_t *x,
-                      int16_t *y,
-                      uint32_t N, //must be a multiple of 8
-                      uint8_t output_shift)
+/*! \brief Complex number dot_product
+  WARNING: the OAI historical name is dot_product but it is not: it is sum(x*y) not, sum(x*conjugate(y))
+*/
+
+c32_t dot_product(const c16_t *x,//! input vector
+                  const c16_t *y,//! input vector
+                  const uint32_t N,//! size of the vectors
+                  const int output_shift //! normalization afer int16 multiplication
+                  )
 {
-  uint32_t n;
-
-#if defined(__x86_64__) || defined(__i386__)
-  __m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im;
-  __m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1);
-  int64_t result;
-
-  x128 = (__m128i*) x;
-  y128 = (__m128i*) y;
-
-  mmcumul_re = _mm_setzero_si128();
-  mmcumul_im = _mm_setzero_si128();
-
-  for (n=0; n<(N>>2); n++) {
-
-//    printf("n=%d, x128=%p, y128=%p\n",n,x128,y128);
-  //      print_shorts("x",&x128[0]);
-    //    print_shorts("y",&y128[0]);
-
-    // this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y)
-    mmtmp1 = _mm_madd_epi16(x128[0],y128[0]);
-      //  print_ints("retmp",&mmtmp1);
-    // mmtmp1 contains real part of 4 consecutive outputs (32-bit)
-
-    // shift and accumulate results
-    mmtmp1 = _mm_srai_epi32(mmtmp1,output_shift);
-    mmcumul_re = _mm_add_epi32(mmcumul_re,mmtmp1);
-        //print_ints("re",&mmcumul_re);
-
-
-    // this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x)
-    mmtmp2 = _mm_shufflelo_epi16(y128[0],_MM_SHUFFLE(2,3,0,1));
-        //print_shorts("y",&mmtmp2);
-    mmtmp2 = _mm_shufflehi_epi16(mmtmp2,_MM_SHUFFLE(2,3,0,1));
-        //print_shorts("y",&mmtmp2);
-    mmtmp2 = _mm_sign_epi16(mmtmp2,minus_i);
-          //  print_shorts("y",&mmtmp2);
-
-    mmtmp3 = _mm_madd_epi16(x128[0],mmtmp2);
-            //print_ints("imtmp",&mmtmp3);
-    // mmtmp3 contains imag part of 4 consecutive outputs (32-bit)
-
-    // shift and accumulate results
-    mmtmp3 = _mm_srai_epi32(mmtmp3,output_shift);
-    mmcumul_im = _mm_add_epi32(mmcumul_im,mmtmp3);
-        //print_ints("im",&mmcumul_im);
-
-    x128++;
-    y128++;
+  const int16_t reflip[32] __attribute__((aligned(32))) = {1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1};
+  const int8_t imshuffle[64] __attribute__((aligned(32))) = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 18, 19, 16, 17, 22, 23, 20, 21, 26, 27, 24, 25, 30, 31, 28, 29};
+  const c16_t *end = x + N;
+  __m256i cumul_re = {0}, cumul_im = {0};
+  while (x < end) {
+    const __m256i in1 = simde_mm256_loadu_si256((__m256i *)x);
+    const __m256i in2 = simde_mm256_loadu_si256((__m256i *)y);
+    const __m256i tmpRe = simde_mm256_madd_epi16(in1, in2);
+    cumul_re = simde_mm256_add_epi32(cumul_re, simde_mm256_srai_epi32(tmpRe, output_shift));
+    const __m256i tmp1 = simde_mm256_shuffle_epi8(in2, *(__m256i *)imshuffle);
+    const __m256i tmp2 = simde_mm256_sign_epi16(tmp1, *(__m256i *)reflip);
+    const __m256i tmpIm = simde_mm256_madd_epi16(in1, tmp2);
+    cumul_im = simde_mm256_add_epi32(cumul_im, simde_mm256_srai_epi32(tmpIm, output_shift));
+    x += 8;
+    y += 8;
  }

-  // this gives Re Re Im Im
-  mmcumul = _mm_hadd_epi32(mmcumul_re,mmcumul_im);
-    //print_ints("cumul1",&mmcumul);
-
-  // this gives Re Im Re Im
-  mmcumul = _mm_hadd_epi32(mmcumul,mmcumul);
-
-    //print_ints("cumul2",&mmcumul);
-
-
-  //mmcumul = _mm_srai_epi32(mmcumul,output_shift);
-  // extract the lower half
-  result = _mm_extract_epi64(mmcumul,0);
-  //printf("result: (%d,%d)\n",((int32_t*)&result)[0],((int32_t*)&result)[1]); 
-  _mm_empty();
-  _m_empty();
- 
-  return(result);
-#elif defined(__arm__) || defined(__aarch64__)
-  int16x4_t *x_128=(int16x4_t*)x;
-  int16x4_t *y_128=(int16x4_t*)y;
-  int32x4_t tmp_re,tmp_im;
-  int32x4_t tmp_re1,tmp_im1;
-  int32x4_t re_cumul,im_cumul;
-  int32x2_t re_cumul2,im_cumul2;
-  int32x4_t shift = vdupq_n_s32(-output_shift); 
-  int32x2x2_t result2;
-  int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ;
-
-  re_cumul = vdupq_n_s32(0);
-  im_cumul = vdupq_n_s32(0); 
-
-  for (n=0; n<(N>>2); n++) {
-
-    tmp_re  = vmull_s16(*x_128++, *y_128++);
-    //tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])] 
-    tmp_re1 = vmull_s16(*x_128++, *y_128++);
-    //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])] 
-    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
-                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
-    //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2]) Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] 
-
-    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
-    //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
-    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
-    //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
-    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
-                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
-    //tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]
-
-    re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift));
-    im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift));
+  // this gives Re Re Im Im Re Re Im Im
+  const __m256i cumulTmp = simde_mm256_hadd_epi32(cumul_re, cumul_im);
+  const __m256i cumul = simde_mm256_hadd_epi32(cumulTmp, cumulTmp);
+
+  c32_t ret;
+  ret.r = simde_mm256_extract_epi32(cumul, 0) + simde_mm256_extract_epi32(cumul, 4);
+  ret.i = simde_mm256_extract_epi32(cumul, 1) + simde_mm256_extract_epi32(cumul, 5);
+  if (x!=end) {
+    x-=8;
+    y-=8;
+    for ( ; x <end; x++,y++ ) {
+      ret.r += ((x->r*y->r)>>output_shift) + ((x->i*y->i)>>output_shift);
+      ret.i += ((x->r*y->i)>>output_shift) - ((x->i*y->r)>>output_shift);
+    }
  }
-  
-  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul));
-  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul));
-  re_cumul2 = vpadd_s32(re_cumul2,re_cumul2);
-  im_cumul2 = vpadd_s32(im_cumul2,im_cumul2);
-  result2   = vzip_s32(re_cumul2,im_cumul2);
-  return(vget_lane_s32(result2.val[0],0));
-#endif
+  return ret;
 }

-
 #ifdef MAIN
 void print_bytes(char *s,__m128i *x)
 {

--- a/openair1/PHY/TOOLS/tools_defs.h
+++ b/openair1/PHY/TOOLS/tools_defs.h
@@ -742,16 +742,10 @@ int32_t phy_phase_compensation_top(uint32_t pilot_type,
                                   uint32_t last_pilot,
                                   int32_t ignore_prefix);

-int32_t dot_product(int16_t *x,
-                    int16_t *y,
-                    uint32_t N, //must be a multiple of 8
-                    uint8_t output_shift);
-
-int64_t dot_product64(int16_t *x,
-                      int16_t *y,
-                      uint32_t N, //must be a multiple of 8
-                      uint8_t output_shift);
-
+c32_t dot_product(const c16_t *x,
+                  const c16_t *y,
+                  const uint32_t N, // must be a multiple of 8
+                  const int output_shift);

 /** @} */