To use c16maddShift(), c32x16maddShift(), c16multaddVectRealComplex(), and...

To use c16maddShift(), c32x16maddShift(), c16multaddVectRealComplex(), and c16x32div() functions in nr_pdsch_channel_estimation()

To use c16maddShift(), c32x16maddShift(), c16multaddVectRealComplex(), and...
To use c16maddShift(), c32x16maddShift(), c16multaddVectRealComplex(), and c16x32div() functions in nr_pdsch_channel_estimation()
0ab5b358 · Roberto Louro Magueta · laurent · e06704a8 · 0ab5b358 · 0ab5b358
Commit 0ab5b358 authored Jan 10, 2023 by Roberto Louro Magueta Committed by laurent Feb 27, 2023
Showing with 290 additions and 557 deletions

openair1/PHY/NR_UE_ESTIMATION/nr_dl_channel_estimation.c openair1/PHY/NR_UE_ESTIMATION/nr_dl_channel_estimation.c +276 -557

openair1/PHY/TOOLS/tools_defs.h openair1/PHY/TOOLS/tools_defs.h +14 -0

No files found.
--- a/openair1/PHY/NR_UE_ESTIMATION/nr_dl_channel_estimation.c
+++ b/openair1/PHY/NR_UE_ESTIMATION/nr_dl_channel_estimation.c
@@ -1054,7 +1054,7 @@ void nr_pdcch_channel_estimation(PHY_VARS_NR_UE *ue,
    rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+k+1)];
    dl_ch = (int16_t *)&pdcch_dl_ch_estimates[aarx][ch_offset];
-    memset(dl_ch,0,4*(ue->frame_parms.ofdm_symbol_size));
+    memset(dl_ch, 0, sizeof(*dl_ch) * ue->frame_parms.ofdm_symbol_size);
 #ifdef DEBUG_PDCCH
    printf("pdcch ch est pilot addr %p RB_DL %d\n",&pilot[dmrs_ref*3], ue->frame_parms.N_RB_DL);
@@ -1247,31 +1247,18 @@ int nr_pdsch_channel_estimation(PHY_VARS_NR_UE *ue,
 {
  int gNB_id = proc->gNB_id;
  int Ns = proc->nr_slot_rx;
-  int pilot[3280] __attribute__((aligned(16)));
+  const int ch_offset = ue->frame_parms.ofdm_symbol_size * symbol;
-  unsigned char aarx;
+  const int symbol_offset = ue->frame_parms.ofdm_symbol_size * symbol;
-  unsigned short k;
-  unsigned int pilot_cnt;
-  int16_t ch_l[2],ch_r[2],ch[2],*pil,*rxF,*dl_ch;
-  int16_t *fdcl = NULL, *fdcr = NULL, *fdclh = NULL, *fdcrh = NULL;
-  int ch_offset,symbol_offset;
-  uint8_t nushift;
-  ch_offset     = ue->frame_parms.ofdm_symbol_size*symbol;
-  symbol_offset = ue->frame_parms.ofdm_symbol_size*symbol;
-  k = bwp_start_subcarrier;
-  int re_offset = k;
 #ifdef DEBUG_PDSCH
-  printf("PDSCH Channel Estimation : gNB_id %d ch_offset %d, symbol_offset %d OFDM size %d, Ncp=%d, Ns=%d, k=%d symbol %d\n",
+  printf("PDSCH Channel Estimation : gNB_id %d ch_offset %d, symbol_offset %d OFDM size %d, Ncp=%d, Ns=%d, bwp_start_subcarrier=%d symbol %d\n",
         gNB_id,
         ch_offset,
         symbol_offset,
         ue->frame_parms.ofdm_symbol_size,
         ue->frame_parms.Ncp,
         Ns,
-         k,
+         bwp_start_subcarrier,
         symbol);
 #endif
@@ -1283,26 +1270,36 @@ int nr_pdsch_channel_estimation(PHY_VARS_NR_UE *ue,
  int8_t delta = get_delta(p, config_type);
  // checking if re-initialization of scrambling IDs is needed
-  if (scrambling_id != ue->scramblingID_dlsch[nscid]){
+  if (scrambling_id != ue->scramblingID_dlsch[nscid]) {
    ue->scramblingID_dlsch[nscid] = scrambling_id;
    nr_gold_pdsch(ue, nscid, scrambling_id);
  }
-  nr_pdsch_dmrs_rx(ue, Ns, ue->nr_gold_pdsch[gNB_id][Ns][symbol][0], &pilot[0], 1000+p, 0, nb_rb_pdsch+rb_offset, config_type);
+  c16_t pilot[3280] __attribute__((aligned(16)));
+  nr_pdsch_dmrs_rx(ue, Ns, ue->nr_gold_pdsch[gNB_id][Ns][symbol][0], (int32_t *)pilot, 1000 + p, 0, nb_rb_pdsch + rb_offset, config_type);
-  if (config_type == NFAPI_NR_DMRS_TYPE1){
+  int16_t *fdcl = NULL;
-    nushift = (p>>1)&1;
+  int16_t *fdcr = NULL;
-    if (p<4) ue->frame_parms.nushift = nushift;
+  int16_t *fdclh = NULL;
-    switch (delta) {
+  int16_t *fdcrh = NULL;
+  uint8_t nushift = 0;
+  if (config_type == NFAPI_NR_DMRS_TYPE1) {
+    nushift = (p >> 1) & 1;
-    case 0://port 0,1
+    if (p < 4)
-      fdcl = filt8_dcl0; //left DC interpolation Filter (even RB)
+      ue->frame_parms.nushift = nushift;
-      fdcr = filt8_dcr0; //right DC interpolation Filter (even RB)
-      fdclh = filt8_dcl0_h; //left DC interpolation Filter (odd RB)
+    switch (delta) {
-      fdcrh = filt8_dcr0_h; //right DC interpolation Filter (odd RB)
+      case 0: // port 0,1
+      fdcl = filt8_dcl0; // left DC interpolation Filter (even RB)
+      fdcr = filt8_dcr0; // right DC interpolation Filter (even RB)
+      fdclh = filt8_dcl0_h; // left DC interpolation Filter (odd RB)
+      fdcrh = filt8_dcr0_h; // right DC interpolation Filter (odd RB)
      break;
-    case 1://port2,3
+      case 1: // port2,3
      fdcl = filt8_dcl1;
      fdcr = filt8_dcr1;
      fdclh = filt8_dcl1_h;
@@ -1310,22 +1307,27 @@ int nr_pdsch_channel_estimation(PHY_VARS_NR_UE *ue,
      break;
      default:
-      LOG_E(PHY,"pdsch_channel_estimation: nushift=%d -> ERROR\n",nushift);
+      LOG_E(PHY, "pdsch_channel_estimation: nushift=%d -> ERROR\n", nushift);
      return -1;
      break;
    }
-  } else {//NFAPI_NR_DMRS_TYPE2
+  } else { // NFAPI_NR_DMRS_TYPE2
    nushift = delta;
-    if (p<6) ue->frame_parms.nushift = nushift;
+    if (p < 6)
+      ue->frame_parms.nushift = nushift;
    switch (delta) {
-    case 0://port 0,1
+      case 0: // ports 0,1
      fdcl = filt8_dcl1;
      fdcr = filt8_dcr1;
      fdclh = filt8_dcl1_h;
      fdcrh = filt8_dcr1_h;
      break;
-    case 2://port2,3
+      case 2: // ports 2,3
      fdcl = NULL;
      fdcr = NULL;
      fdclh = NULL;
@@ -1333,13 +1335,13 @@ int nr_pdsch_channel_estimation(PHY_VARS_NR_UE *ue,
      break;
      default:
-      LOG_E(PHY,"pdsch_channel_estimation: nushift=%d -> ERROR\n",nushift);
+      LOG_E(PHY, "pdsch_channel_estimation: nushift=%d -> ERROR\n", nushift);
      return -1;
      break;
    }
  }
-  for (aarx = 0; aarx < ue->frame_parms.nb_antennas_rx; aarx++) {
+  for (int aarx = 0; aarx < ue->frame_parms.nb_antennas_rx; aarx++) {
 #ifdef DEBUG_PDSCH
    printf("\n============================================\n");
@@ -1347,615 +1349,332 @@ int nr_pdsch_channel_estimation(PHY_VARS_NR_UE *ue,
    printf("============================================\n");
 #endif
-    pil = (int16_t *)&pilot[rb_offset * ((config_type == NFAPI_NR_DMRS_TYPE1) ? 6 : 4)];
+    int re_offset = bwp_start_subcarrier % ue->frame_parms.ofdm_symbol_size;
-    k = k % ue->frame_parms.ofdm_symbol_size;
+    c16_t *pil = &pilot[rb_offset * ((config_type == NFAPI_NR_DMRS_TYPE1) ? 6 : 4)];
-    re_offset = k;
+    c16_t *rxF = &rxdataF[aarx][(symbol_offset + re_offset + nushift)];
-    rxF = (int16_t *)&rxdataF[aarx][(symbol_offset + re_offset + nushift)];
+    c16_t *dl_ch = (c16_t *)&dl_ch_estimates[p * ue->frame_parms.nb_antennas_rx + aarx][ch_offset];
-    dl_ch = (int16_t *)&dl_ch_estimates[p * ue->frame_parms.nb_antennas_rx + aarx][ch_offset];
+    memset(dl_ch, 0, sizeof(*dl_ch) * ue->frame_parms.ofdm_symbol_size);
-    memset(dl_ch, 0, 4 * (ue->frame_parms.ofdm_symbol_size));
+    c16_t ch = {0};
    if (config_type == NFAPI_NR_DMRS_TYPE1 && ue->chest_freq == 0) {
-      for (pilot_cnt = 0; pilot_cnt < 6 * nb_rb_pdsch; pilot_cnt++) {
+      for (int pilot_cnt = 0; pilot_cnt < 6 * nb_rb_pdsch; pilot_cnt++) {
        if (pilot_cnt % 2 == 0) {
-          ch[0] = (int16_t)(((int32_t)pil[0] * rxF[0] - (int32_t)pil[1] * rxF[1]) >> 15);
+          ch = c16mulShift(*pil, *rxF, 15);
-          ch[1] = (int16_t)(((int32_t)pil[0] * rxF[1] + (int32_t)pil[1] * rxF[0]) >> 15);
+          pil++;
-          pil += 2;
          re_offset = (re_offset + 2) % ue->frame_parms.ofdm_symbol_size;
-          rxF = (int16_t *)&rxdataF[aarx][(symbol_offset + nushift + re_offset)];
+          rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
-          ch[0] += (int16_t)(((int32_t)pil[0] * rxF[0] - (int32_t)pil[1] * rxF[1]) >> 15);
+          ch = c16maddShift(*pil, *rxF, ch, 15);
-          ch[1] += (int16_t)(((int32_t)pil[0] * rxF[1] + (int32_t)pil[1] * rxF[0]) >> 15);
+          ch = c16Shift(ch, 1);
-          ch[0] >>= 1;
+          pil++;
-          ch[1] >>= 1;
-          pil += 2;
          re_offset = (re_offset + 2) % ue->frame_parms.ofdm_symbol_size;
-          rxF = (int16_t *)&rxdataF[aarx][(symbol_offset + nushift + re_offset)];
+          rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
        }
 #ifdef DEBUG_PDSCH
-        printf("pilot %3u: pil -> (%6d,%6d), rxF -> (%4d,%4d), ch -> (%4d,%4d) \n", pilot_cnt, pil[0], pil[1], rxF[0], rxF[1], ch[0], ch[1]);
+      printf("pilot %3u: pil -> (%6d,%6d), rxF -> (%4d,%4d), ch -> (%4d,%4d) \n", pilot_cnt, pil->r, pil->i, rxF->r, rxF->i, ch.r, ch.i);
 #endif
        if (pilot_cnt == 0) { // Treat first pilot
-          multadd_real_vector_complex_scalar(filt16_dl_first, ch, dl_ch, 16);
+          c16multaddVectRealComplex(filt16_dl_first, &ch, dl_ch, 16);
        } else if (pilot_cnt == 6 * nb_rb_pdsch - 1) { // Treat last pilot
-          multadd_real_vector_complex_scalar(filt16_dl_last, ch, dl_ch, 16);
+          c16multaddVectRealComplex(filt16_dl_last, &ch, dl_ch, 16);
        } else { // Treat middle pilots
-          multadd_real_vector_complex_scalar(filt16_dl_middle, ch, dl_ch, 16);
+          c16multaddVectRealComplex(filt16_dl_middle, &ch, dl_ch, 16);
          if (pilot_cnt % 2 == 0) {
-            dl_ch += 8;
+            dl_ch += 4;
          }
        }
      }
      // check if PRB crosses DC and improve estimates around DC
-      if ((bwp_start_subcarrier < ue->frame_parms.ofdm_symbol_size) && (bwp_start_subcarrier+nb_rb_pdsch*12 >= ue->frame_parms.ofdm_symbol_size)) {
+      if ((bwp_start_subcarrier < ue->frame_parms.ofdm_symbol_size) && (bwp_start_subcarrier + nb_rb_pdsch * 12 >= ue->frame_parms.ofdm_symbol_size)) {
-        dl_ch = (int16_t *)&dl_ch_estimates[aarx][ch_offset];
+        dl_ch = (c16_t *)&dl_ch_estimates[aarx][ch_offset];
-        uint16_t idxDC = 2*(ue->frame_parms.ofdm_symbol_size - bwp_start_subcarrier);
+        uint16_t idxDC = 2 * (ue->frame_parms.ofdm_symbol_size - bwp_start_subcarrier);
-        uint16_t idxPil = idxDC/2;
+        uint16_t idxPil = idxDC / 4;
-        re_offset = k;
+        re_offset = bwp_start_subcarrier % ue->frame_parms.ofdm_symbol_size;
-        pil = (int16_t *)&pilot[rb_offset*((config_type == NFAPI_NR_DMRS_TYPE1) ? 6:4)];
+        pil = &pilot[rb_offset * ((config_type == NFAPI_NR_DMRS_TYPE1) ? 6 : 4)];
-        pil += (idxPil-2);
+        pil += (idxPil - 1);
-        dl_ch += (idxDC-4);
+        dl_ch += (idxDC / 2 - 2);
-        dl_ch = memset(dl_ch, 0, sizeof(int16_t)*10);
+        dl_ch = memset(dl_ch, 0, sizeof(c16_t) * 5);
-        re_offset = (re_offset+idxDC/2-2) % ue->frame_parms.ofdm_symbol_size;
+        re_offset = (re_offset + idxDC / 2 - 2) % ue->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
+        rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
-        ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
+        ch = c16mulShift(*pil, *rxF, 15);
-        ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
+        pil++;
-        pil += 2;
        re_offset = (re_offset + 2) % ue->frame_parms.ofdm_symbol_size;
-        rxF = (int16_t *)&rxdataF[aarx][(symbol_offset + nushift + re_offset)];
+        rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
-        ch[0] += (int16_t)(((int32_t)pil[0] * rxF[0] - (int32_t)pil[1] * rxF[1]) >> 15);
+        ch = c16maddShift(*pil, *rxF, ch, 15);
-        ch[1] += (int16_t)(((int32_t)pil[0] * rxF[1] + (int32_t)pil[1] * rxF[0]) >> 15);
+        ch = c16Shift(ch, 1);
-        ch[0] >>= 1;
-        ch[1] >>= 1;
        // for proper allignment of SIMD vectors
-        if((ue->frame_parms.N_RB_DL&1) == 0) {
+        if ((ue->frame_parms.N_RB_DL & 1) == 0) {
+          c16multaddVectRealComplex(fdcl, &ch, dl_ch - 2, 8);
-          multadd_real_vector_complex_scalar(fdcl,
+          pil++;
-                     ch,
-                     dl_ch-4,
-                     8);
-          pil += 2;
-          re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-          rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-          ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
-          ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-          pil += 2;
          re_offset = (re_offset + 2) % ue->frame_parms.ofdm_symbol_size;
-          rxF = (int16_t *)&rxdataF[aarx][(symbol_offset + nushift + re_offset)];
+          rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
-          ch[0] += (int16_t)(((int32_t)pil[0] * rxF[0] - (int32_t)pil[1] * rxF[1]) >> 15);
+          ch = c16mulShift(*pil, *rxF, 15);
-          ch[1] += (int16_t)(((int32_t)pil[0] * rxF[1] + (int32_t)pil[1] * rxF[0]) >> 15);
+          pil++;
-          ch[0] >>= 1;
+          re_offset = (re_offset + 2) % ue->frame_parms.ofdm_symbol_size;
-          ch[1] >>= 1;
+          rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
+          ch = c16maddShift(*pil, *rxF, ch, 15);
-          multadd_real_vector_complex_scalar(fdcr,
+          ch = c16Shift(ch, 1);
-                     ch,
+          c16multaddVectRealComplex(fdcr, &ch, dl_ch - 2, 8);
-                     dl_ch-4,
-                     8);
        } else {
+          c16multaddVectRealComplex(fdclh, &ch, dl_ch, 8);
-          multadd_real_vector_complex_scalar(fdclh,
+          pil++;
-                     ch,
-                     dl_ch,
-                     8);
-          pil += 2;
-          re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-          rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-          ch[0] = (int16_t)(((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15);
-          ch[1] = (int16_t)(((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15);
-          pil += 2;
          re_offset = (re_offset + 2) % ue->frame_parms.ofdm_symbol_size;
-          rxF = (int16_t *)&rxdataF[aarx][(symbol_offset + nushift + re_offset)];
+          rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
-          ch[0] += (int16_t)(((int32_t)pil[0] * rxF[0] - (int32_t)pil[1] * rxF[1]) >> 15);
+          ch = c16mulShift(*pil, *rxF, 15);
-          ch[1] += (int16_t)(((int32_t)pil[0] * rxF[1] + (int32_t)pil[1] * rxF[0]) >> 15);
+          pil++;
-          ch[0] >>= 1;
+          re_offset = (re_offset + 2) % ue->frame_parms.ofdm_symbol_size;
-          ch[1] >>= 1;
+          rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
+          ch = c16maddShift(*pil, *rxF, ch, 15);
-          multadd_real_vector_complex_scalar(fdcrh,
+          ch = c16Shift(ch, 1);
-                     ch,
+          c16multaddVectRealComplex(fdcrh, &ch, dl_ch, 8);
-                     dl_ch,
-                     8);
        }
      }
-    } else if (config_type == NFAPI_NR_DMRS_TYPE2 && ue->chest_freq == 0){ //pdsch_dmrs_type2  |dmrs_r,dmrs_l,0,0,0,0,dmrs_r,dmrs_l,0,0,0,0|
-      for (pilot_cnt = 0; pilot_cnt < 4 * nb_rb_pdsch; pilot_cnt++) {
+    // pdsch_dmrs_type2  |dmrs_r,dmrs_l,0,0,0,0,dmrs_r,dmrs_l,0,0,0,0|
-        if (pilot_cnt % 2 == 0) {
+    } else if (config_type == NFAPI_NR_DMRS_TYPE2 && ue->chest_freq == 0) {
-          ch_l[0] = (int16_t)(((int32_t)pil[0] * rxF[0] - (int32_t)pil[1] * rxF[1]) >> 15);
-          ch_l[1] = (int16_t)(((int32_t)pil[0] * rxF[1] + (int32_t)pil[1] * rxF[0]) >> 15);
+      c16_t ch_l = {0};
+      c16_t ch_r = {0};
+      for (int pilot_cnt = 0; pilot_cnt < 4 * nb_rb_pdsch; pilot_cnt += 2) {
+        ch_l = c16mulShift(*pil, *rxF, 15);
 #ifdef DEBUG_PDSCH
-          printf("pilot %3u: pil -> (%6d,%6d), rxF -> (%4d,%4d), ch -> (%4d,%4d) \n", pilot_cnt, pil[0], pil[1], rxF[0], rxF[1], ch_l[0], ch_l[1]);
+        printf("pilot %3u: pil -> (%6d,%6d), rxF -> (%4d,%4d), ch -> (%4d,%4d) \n", pilot_cnt, pil->r, pil->i, rxF->r, rxF->i, ch_l.r, ch_l.i);
 #endif
+        pil++;
-        } else {
+        re_offset = (re_offset + 1) % ue->frame_parms.ofdm_symbol_size;
-          ch_r[0] = (int16_t)(((int32_t)pil[0] * rxF[0] - (int32_t)pil[1] * rxF[1]) >> 15);
+        rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
-          ch_r[1] = (int16_t)(((int32_t)pil[0] * rxF[1] + (int32_t)pil[1] * rxF[0]) >> 15);
+        ch_r = c16mulShift(*pil, *rxF, 15);
 #ifdef DEBUG_PDSCH
-          printf("pilot %3u: pil -> (%6d,%6d), rxF -> (%4d,%4d), ch -> (%4d,%4d) \n", pilot_cnt, pil[0], pil[1], rxF[0], rxF[1], ch_r[0], ch_r[1]);
+        printf("pilot %3u: pil -> (%6d,%6d), rxF -> (%4d,%4d), ch -> (%4d,%4d) \n", pilot_cnt, pil->r, pil->i, rxF->r, rxF->i, ch_r.r, ch_r.i);
 #endif
-          ch[0] = (ch_l[0] + ch_r[0]) >> 1;
+        ch = c16addShift(ch_l, ch_r, 1);
-          ch[1] = (ch_l[1] + ch_r[1]) >> 1;
        if (pilot_cnt == 1) {
-            multadd_real_vector_complex_scalar(filt16_dl_first_type2, ch, dl_ch, 16);
+          multadd_real_vector_complex_scalar(filt16_dl_first_type2, (int16_t *)&ch, (int16_t *)dl_ch, 16);
-            dl_ch += 6;
+          dl_ch += 3;
        } else if (pilot_cnt == (4 * nb_rb_pdsch - 1)) {
-            multadd_real_vector_complex_scalar(filt16_dl_last_type2, ch, dl_ch, 16);
+          multadd_real_vector_complex_scalar(filt16_dl_last_type2, (int16_t *)&ch, (int16_t *)dl_ch, 16);
        } else {
-            multadd_real_vector_complex_scalar(filt16_dl_middle_type2, ch, dl_ch, 16);
+          multadd_real_vector_complex_scalar(filt16_dl_middle_type2, (int16_t *)&ch, (int16_t *)dl_ch, 16);
-            dl_ch += 12;
+          dl_ch += 6;
-          }
        }
-        pil += 2;
+        pil++;
-        int re_offset_add = pilot_cnt % 2 == 0 ? 1 : 5;
+        re_offset = (re_offset + 5) % ue->frame_parms.ofdm_symbol_size;
-        re_offset = (re_offset + re_offset_add) % ue->frame_parms.ofdm_symbol_size;
+        rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
-        rxF = (int16_t *)&rxdataF[aarx][(symbol_offset + nushift + re_offset)];
      }
      // check if PRB crosses DC and improve estimates around DC
-      if ((bwp_start_subcarrier < ue->frame_parms.ofdm_symbol_size) && (bwp_start_subcarrier+nb_rb_pdsch*12 >= ue->frame_parms.ofdm_symbol_size) && (p<2)) {
+      if ((bwp_start_subcarrier < ue->frame_parms.ofdm_symbol_size) && (bwp_start_subcarrier + nb_rb_pdsch * 12 >= ue->frame_parms.ofdm_symbol_size) && (p < 2)) {
+        dl_ch = (c16_t *)&dl_ch_estimates[p * ue->frame_parms.nb_antennas_rx + aarx][ch_offset];
+        uint16_t idxDC = 2 * (ue->frame_parms.ofdm_symbol_size - bwp_start_subcarrier);
+        dl_ch += (idxDC / 2 - 4);
+        dl_ch = memset(dl_ch, 0, sizeof(c16_t) * 10);
-        dl_ch = (int16_t *)&dl_ch_estimates[p*ue->frame_parms.nb_antennas_rx+aarx][ch_offset];
+        dl_ch--;
-        uint16_t idxDC = 2*(ue->frame_parms.ofdm_symbol_size - bwp_start_subcarrier);
+        ch_r = *dl_ch;
-        dl_ch += (idxDC-8);
+        dl_ch += 11;
-        dl_ch = memset(dl_ch, 0, sizeof(int16_t)*20);
+        ch_l = *dl_ch;
-        dl_ch -= 2;
-        ch_r[0] = dl_ch[0];
-        ch_r[1]= dl_ch[1] ;
-        dl_ch += 22;
-        ch_l[0] = dl_ch[0];
-        ch_l[1]= dl_ch[1] ;
        // for proper allignment of SIMD vectors
-        if((ue->frame_parms.N_RB_DL&1) == 0) {
+        if ((ue->frame_parms.N_RB_DL & 1) == 0) {
-          dl_ch -= 20;
+          dl_ch -= 10;
-          //Interpolate fdcrl1 with ch_r
+          // Interpolate fdcrl1 with ch_r
-          multadd_real_vector_complex_scalar(filt8_dcrl1,
+          c16multaddVectRealComplex(filt8_dcrl1, &ch_r, dl_ch, 8);
-                                             ch_r,
+          // Interpolate fdclh1 with ch_l
-                                             dl_ch,
+          c16multaddVectRealComplex(filt8_dclh1, &ch_l, dl_ch, 8);
-                                             8);
+          dl_ch += 8;
-          //Interpolate fdclh1 with ch_l
+          // Interpolate fdcrh1 with ch_r
-          multadd_real_vector_complex_scalar(filt8_dclh1,
+          c16multaddVectRealComplex(filt8_dcrh1, &ch_r, dl_ch, 8);
-                                             ch_l,
+          // Interpolate fdcll1 with ch_l
-                                             dl_ch,
+          c16multaddVectRealComplex(filt8_dcll1, &ch_l, dl_ch, 8);
-                                             8);
-          dl_ch += 16;
-          //Interpolate fdcrh1 with ch_r
-          multadd_real_vector_complex_scalar(filt8_dcrh1,
-                                             ch_r,
-                                             dl_ch,
-                                             8);
-          //Interpolate fdcll1 with ch_l
-          multadd_real_vector_complex_scalar(filt8_dcll1,
-                                             ch_l,
-                                             dl_ch,
-                                             8);
        } else {
-          dl_ch -= 28;
+          dl_ch -= 14;
-          //Interpolate fdcrl1 with ch_r
+          // Interpolate fdcrl1 with ch_r
-          multadd_real_vector_complex_scalar(filt8_dcrl2,
+          c16multaddVectRealComplex(filt8_dcrl2, &ch_r, dl_ch, 8);
-                                             ch_r,
+          // Interpolate fdclh1 with ch_l
-                                             dl_ch,
+          c16multaddVectRealComplex(filt8_dclh2, &ch_l, dl_ch, 8);
-                                             8);
+          dl_ch += 8;
-          //Interpolate fdclh1 with ch_l
+          // Interpolate fdcrh1 with ch_r
-          multadd_real_vector_complex_scalar(filt8_dclh2,
+          c16multaddVectRealComplex(filt8_dcrh2, &ch_r, dl_ch, 8);
-                                             ch_l,
+          // Interpolate fdcll1 with ch_l
-                                             dl_ch,
+          c16multaddVectRealComplex(filt8_dcll2, &ch_l, dl_ch, 8);
-                                             8);
-          dl_ch += 16;
-          //Interpolate fdcrh1 with ch_r
-          multadd_real_vector_complex_scalar(filt8_dcrh2,
-                                             ch_r,
-                                             dl_ch,
-                                             8);
-          //Interpolate fdcll1 with ch_l
-          multadd_real_vector_complex_scalar(filt8_dcll2,
-                                             ch_l,
-                                             dl_ch,
-                                             8);
-        }
        }
      }
-    else if (config_type == NFAPI_NR_DMRS_TYPE1) { // this is case without frequency-domain linear interpolation, just take average of LS channel estimates of 6 DMRS REs and use a common value for the whole PRB
-      int32_t ch_0, ch_1;
-      ch_0 = ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 = ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
+    // this is case without frequency-domain linear interpolation, just take average of LS channel estimates of 6 DMRS REs and use a common value for the whole PRB
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
+    } else if (config_type == NFAPI_NR_DMRS_TYPE1) {
-      pil += 2;
+      int P_average = 6;
-      re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
+      c32_t ch32 = {0};
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
+      for (int p_av = 0; p_av < P_average; p_av++) {
+        ch32 = c32x16maddShift(*pil, *rxF, ch32, 15);
-      ch[0] = ch_0 / 6;
+        pil++;
-      ch[1] = ch_1 / 6;
+        re_offset = (re_offset + 2) % ue->frame_parms.ofdm_symbol_size;
+        rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
+      }
+      ch = c16x32div(ch32, P_average);
 #if NO_INTERP
-      for (int i=0;i<12;i++) ((int32_t*)dl_ch)[i] = *(int32_t*)ch;
+      for (int i = 0; i < 2 * P_average; i++) {
-      dl_ch+=24;
+        dl_ch[i] = ch;
+      }
+      dl_ch += 2 * P_average;
 #else
-      multadd_real_vector_complex_scalar(filt8_avlip0,
+      c16multaddVectRealComplex(filt8_avlip0, &ch, dl_ch, 8);
-                                         ch,
-                                         dl_ch,
-                                         8);
      dl_ch += 16;
-      multadd_real_vector_complex_scalar(filt8_avlip1,
+      c16multaddVectRealComplex(filt8_avlip1, &ch, dl_ch, 8);
-                                         ch,
-                                         dl_ch,
-                                         8);
      dl_ch += 16;
-      multadd_real_vector_complex_scalar(filt8_avlip2,
+      c16multaddVectRealComplex(filt8_avlip2, &ch, dl_ch, 8);
-                                         ch,
-                                         dl_ch,
-                                         8);
      dl_ch -= 24;
 #endif
-      for (pilot_cnt=6; pilot_cnt<6*(nb_rb_pdsch-1); pilot_cnt += 6) {
+      for (int pilot_cnt = P_average; pilot_cnt < 6 * (nb_rb_pdsch - 1); pilot_cnt += P_average) {
+        ch32.r = 0;
-        ch_0 = ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
+        ch32.i = 0;
-        ch_1 = ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
+        for (int p_av = 0; p_av < P_average; p_av++) {
+          ch32 = c32x16maddShift(*pil, *rxF, ch32, 15);
-        pil += 2;
+          pil++;
-        re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
+          re_offset = (re_offset + 2) % ue->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
+          rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
+        }
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
+        ch = c16x32div(ch32, P_average);
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-        pil += 2;
-        re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-        pil += 2;
-        re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-        pil += 2;
-        re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-        pil += 2;
-        re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-        pil += 2;
-        re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-        ch[0] = ch_0 / 6;
-        ch[1] = ch_1 / 6;
 #if NO_INTERP
-        for (int i=0;i<12;i++) ((int32_t*)dl_ch)[i] = *(int32_t*)ch;
+        for (int i = 0; i < 2 * P_average; i++) {
-        dl_ch+=24;
+          dl_ch[i] = ch;
+        }
+        dl_ch += 2 * P_average;
 #else
-        dl_ch[6] += (ch[0] * 1365)>>15; // 1/12*16384
+        dl_ch[3].r += (ch.r * 1365) >> 15; // 1/12*16384
-        dl_ch[7] += (ch[1] * 1365)>>15; // 1/12*16384
+        dl_ch[3].i += (ch.i * 1365) >> 15; // 1/12*16384
+        dl_ch += 4;
+        c16multaddVectRealComplex(filt8_avlip3, &ch, dl_ch, 8);
        dl_ch += 8;
-        multadd_real_vector_complex_scalar(filt8_avlip3,
+        c16multaddVectRealComplex(filt8_avlip4, &ch, dl_ch, 8);
-                                           ch,
+        dl_ch += 8;
-                                           dl_ch,
+        c16multaddVectRealComplex(filt8_avlip5, &ch, dl_ch, 8);
-                                           8);
+        dl_ch -= 8;
-        dl_ch += 16;
-        multadd_real_vector_complex_scalar(filt8_avlip4,
-                                           ch,
-                                           dl_ch,
-                                           8);
-        dl_ch += 16;
-        multadd_real_vector_complex_scalar(filt8_avlip5,
-                                           ch,
-                                           dl_ch,
-                                           8);
-        dl_ch -= 16;
 #endif
      }
-      ch_0 = ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 = ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
+      ch32.r = 0;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
+      ch32.i = 0;
+      for (int p_av = 0; p_av < P_average; p_av++) {
-      pil += 2;
+        ch32 = c32x16maddShift(*pil, *rxF, ch32, 15);
-      re_offset = (re_offset+2) % ue->frame_parms.ofdm_symbol_size;
+        pil++;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
+        re_offset = (re_offset + 2) % ue->frame_parms.ofdm_symbol_size;
+        rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
-      ch[0] = ch_0 / 6;
+      }
-      ch[1] = ch_1 / 6;
+      ch = c16x32div(ch32, P_average);
 #if NO_INTERP
-      for (int i=0;i<12;i++) ((int32_t*)dl_ch)[i] = *(int32_t*)ch;
+      for (int i = 0; i < 2 * P_average; i++) {
-      dl_ch+=24;
+        dl_ch[i] = ch;
+      }
+      dl_ch += 2 * P_average;
 #else
-      dl_ch[6] += (ch[0] * 1365)>>15; // 1/12*16384
+      dl_ch[3].r += (ch.r * 1365) >> 15; // 1/12*16384
-      dl_ch[7] += (ch[1] * 1365)>>15; // 1/12*16384
+      dl_ch[3].i += (ch.i * 1365) >> 15; // 1/12*16384
+      dl_ch += 4;
+      c16multaddVectRealComplex(filt8_avlip3, &ch, dl_ch, 8);
      dl_ch += 8;
-      multadd_real_vector_complex_scalar(filt8_avlip3,
+      c16multaddVectRealComplex(filt8_avlip6, &ch, dl_ch, 8);
-                                         ch,
-                                         dl_ch,
-                                         8);
-      dl_ch += 16;
-      multadd_real_vector_complex_scalar(filt8_avlip6,
-                                         ch,
-                                         dl_ch,
-                                         8);
 #endif
-    }
-    else  { // this is case without frequency-domain linear interpolation, just take average of LS channel estimates of 4 DMRS REs and use a common value for the whole PRB
-      int32_t ch_0, ch_1;
-      ch_0 = ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 = ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
+    // this is case without frequency-domain linear interpolation, just take average of LS channel estimates of 4 DMRS REs and use a common value for the whole PRB
-      re_offset = (re_offset+1) % ue->frame_parms.ofdm_symbol_size;
+    } else {
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+5) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+1) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+5) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch[0] = ch_0 / 4;
+      int P_average = 4;
-      ch[1] = ch_1 / 4;
+      c32_t ch32 = {0};
+      for (int p_av = 0; p_av < P_average; p_av++) {
+        ch32 = c32x16maddShift(*pil, *rxF, ch32, 15);
+        pil++;
+        re_offset = (re_offset + 1) % ue->frame_parms.ofdm_symbol_size;
+        rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
+      }
+      ch = c16x32div(ch32, P_average);
 #if NO_INTERP
-      for (int i=0;i<12;i++) ((int32_t*)dl_ch)[i] = *(int32_t*)ch;
+      for (int i = 0; i < 3 * P_average; i++) {
-      dl_ch+=24;
+        dl_ch[i] = ch;
+      }
+      dl_ch += 3 * P_average;
 #else
-      multadd_real_vector_complex_scalar(filt8_avlip0,
+      c16multaddVectRealComplex(filt8_avlip0, &ch, dl_ch, 8);
-                                         ch,
+      dl_ch += 8;
-                                         dl_ch,
+      c16multaddVectRealComplex(filt8_avlip1, &ch, dl_ch, 8);
-                                         8);
+      dl_ch += 8;
+      c16multaddVectRealComplex(filt8_avlip2, &ch, dl_ch, 8);
-      dl_ch += 16;
+      dl_ch -= 12;
-      multadd_real_vector_complex_scalar(filt8_avlip1,
-                                         ch,
-                                         dl_ch,
-                                         8);
-      dl_ch += 16;
-      multadd_real_vector_complex_scalar(filt8_avlip2,
-                                         ch,
-                                         dl_ch,
-                                         8);
-      dl_ch -= 24;
 #endif
-      for (pilot_cnt=4; pilot_cnt<4*(nb_rb_pdsch-1); pilot_cnt += 4) {
+      for (int pilot_cnt = P_average; pilot_cnt < 4 * (nb_rb_pdsch - 1); pilot_cnt += P_average) {
-        int32_t ch_0, ch_1;
+        ch32.r = 0;
+        ch32.i = 0;
-        ch_0 = ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
+        for (int p_av = 0; p_av < P_average; p_av++) {
-        ch_1 = ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
+          ch32 = c32x16maddShift(*pil, *rxF, ch32, 15);
+          pil++;
-        pil += 2;
+          re_offset = (re_offset + 5) % ue->frame_parms.ofdm_symbol_size;
-        re_offset = (re_offset+1) % ue->frame_parms.ofdm_symbol_size;
+          rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
+        }
+        ch = c16x32div(ch32, P_average);
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-        pil += 2;
-        re_offset = (re_offset+5) % ue->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-        pil += 2;
-        re_offset = (re_offset+1) % ue->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-        ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-        ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-        pil += 2;
-        re_offset = (re_offset+5) % ue->frame_parms.ofdm_symbol_size;
-        rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-        ch[0] = ch_0 / 4;
-        ch[1] = ch_1 / 4;
 #if NO_INTERP
-        for (int i=0;i<12;i++) ((int32_t*)dl_ch)[i] = *(int32_t*)ch;
+        for (int i = 0; i < 3 * P_average; i++) {
-        dl_ch+=24;
+          dl_ch[i] = ch;
+        }
+        dl_ch += 3 * P_average;
 #else
-        dl_ch[6] += (ch[0] * 1365)>>15; // 1/12*16384
+        dl_ch[3].r += (ch.r * 1365) >> 15; // 1/12*16384
-        dl_ch[7] += (ch[1] * 1365)>>15; // 1/12*16384
+        dl_ch[3].i += (ch.i * 1365) >> 15; // 1/12*16384
+        dl_ch += 4;
+        c16multaddVectRealComplex(filt8_avlip3, &ch, dl_ch, 8);
        dl_ch += 8;
-        multadd_real_vector_complex_scalar(filt8_avlip3,
+        c16multaddVectRealComplex(filt8_avlip4, &ch, dl_ch, 8);
-                                           ch,
+        dl_ch += 8;
-                                           dl_ch,
+        c16multaddVectRealComplex(filt8_avlip5, &ch, dl_ch, 8);
-                                           8);
+        dl_ch -= 8;
-        dl_ch += 16;
-        multadd_real_vector_complex_scalar(filt8_avlip4,
-                                           ch,
-                                           dl_ch,
-                                           8);
-        dl_ch += 16;
-        multadd_real_vector_complex_scalar(filt8_avlip5,
-                                           ch,
-                                           dl_ch,
-                                           8);
-        dl_ch -= 16;
 #endif
      }
-      ch_0 = ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
+      ch32.r = 0;
-      ch_1 = ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
+      ch32.i = 0;
+      for (int p_av = 0; p_av < P_average; p_av++) {
-      pil += 2;
+        ch32 = c32x16maddShift(*pil, *rxF, ch32, 15);
-      re_offset = (re_offset+1) % ue->frame_parms.ofdm_symbol_size;
+        pil++;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
+        re_offset = (re_offset + 5) % ue->frame_parms.ofdm_symbol_size;
+        rxF = &rxdataF[aarx][(symbol_offset + nushift + re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
+      }
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
+      ch = c16x32div(ch32, P_average);
-      pil += 2;
-      re_offset = (re_offset+5) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+1) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch_0 += ((int32_t)pil[0]*rxF[0] - (int32_t)pil[1]*rxF[1])>>15;
-      ch_1 += ((int32_t)pil[0]*rxF[1] + (int32_t)pil[1]*rxF[0])>>15;
-      pil += 2;
-      re_offset = (re_offset+5) % ue->frame_parms.ofdm_symbol_size;
-      rxF   = (int16_t *)&rxdataF[aarx][(symbol_offset+nushift+re_offset)];
-      ch[0] = ch_0 / 4;
-      ch[1] = ch_1 / 4;
 #if NO_INTERP
-      for (int i=0;i<12;i++) ((int32_t*)dl_ch)[i] = *(int32_t*)ch;
+      for (int i = 0; i < 3 * P_average; i++) {
-      dl_ch+=24;
+        dl_ch[i] = ch;
+      }
+      dl_ch += 3 * P_average;
 #else
-      dl_ch[6] += (ch[0] * 1365)>>15; // 1/12*16384
+      dl_ch[3].r += (ch.r * 1365) >> 15; // 1/12*16384
-      dl_ch[7] += (ch[1] * 1365)>>15; // 1/12*16384
+      dl_ch[3].i += (ch.i * 1365) >> 15; // 1/12*16384
+      dl_ch += 4;
+      c16multaddVectRealComplex(filt8_avlip3, &ch, dl_ch, 8);
      dl_ch += 8;
-      multadd_real_vector_complex_scalar(filt8_avlip3,
+      c16multaddVectRealComplex(filt8_avlip6, &ch, dl_ch, 8);
-                                         ch,
-                                         dl_ch,
-                                         8);
-      dl_ch += 16;
-      multadd_real_vector_complex_scalar(filt8_avlip6,
-                                         ch,
-                                         dl_ch,
-                                         8);
 #endif
    }
 #ifdef DEBUG_PDSCH
-    dl_ch = (int16_t *)&dl_ch_estimates[p*ue->frame_parms.nb_antennas_rx+aarx][ch_offset];
+    dl_ch = (c16_t *)&dl_ch_estimates[p * ue->frame_parms.nb_antennas_rx + aarx][ch_offset];
-    for(uint16_t idxP=0; idxP<ceil((float)nb_rb_pdsch*12/8); idxP++) {
+    for (uint16_t idxP = 0; idxP < ceil((float)nb_rb_pdsch * 12 / 8); idxP++) {
-      for(uint8_t idxI=0; idxI<16; idxI += 2) {
+      for (uint8_t idxI = 0; idxI < 8; idxI++) {
-        printf("%4d\t%4d\t",dl_ch[idxP*16+idxI],dl_ch[idxP*16+idxI+1]);
+        printf("%4d\t%4d\t", dl_ch[idxP * 8 + idxI].r, dl_ch[idxP * 8 + idxI].i);
      }
-      printf("%2d\n",idxP);
+      printf("%2d\n", idxP);
    }
 #endif
  }
-  return(0);
+  return 0;
 }
 /*******************************************************************

--- a/openair1/PHY/TOOLS/tools_defs.h
+++ b/openair1/PHY/TOOLS/tools_defs.h
@@ -97,6 +97,20 @@ extern "C" {
 #define squaredMod(a) ((a).r*(a).r + (a).i*(a).i)
 #define csum(res, i1, i2) (res).r = (i1).r + (i2).r ; (res).i = (i1).i + (i2).i
+  __attribute__((always_inline)) inline c16_t c16Shift(const c16_t a, const int Shift) {
+    return (c16_t) {
+        .r = (int16_t)(a.r >> Shift),
+        .i = (int16_t)(a.i >> Shift)
+    };
+  }
+  __attribute__((always_inline)) inline c16_t c16addShift(const c16_t a, const c16_t b, const int Shift) {
+    return (c16_t) {
+        .r = (int16_t)((a.r + b.r) >> Shift),
+        .i = (int16_t)((a.i + b.i) >> Shift)
+    };
+  }
  __attribute__((always_inline)) inline c16_t c16mulShift(const c16_t a, const c16_t b, const int Shift) {
    return (c16_t) {
      .r = (int16_t)((a.r * b.r - a.i * b.i) >> Shift),