Commit d74b2bd1 authored by laurent's avatar laurent

improve dot_product function

parent 8773e423
......@@ -35,8 +35,7 @@ int32x4_t avg128F;
#endif
//compute average channel_level on each (TX,RX) antenna pair
int dl_channel_level(int16_t *dl_ch,
LTE_DL_FRAME_PARMS *frame_parms)
int dl_channel_level(c16_t *dl_ch, LTE_DL_FRAME_PARMS *frame_parms)
{
int16_t rb;
......@@ -100,13 +99,9 @@ int lte_est_freq_offset(int **dl_ch_estimates,
int* freq_offset,
int reset)
{
int ch_offset, omega, dl_ch_shift;
c16_t omega_cpx;
int ch_offset;
double phase_offset;
int freq_offset_est;
unsigned char aa;
int16_t *dl_ch,*dl_ch_prev;
static int first_run = 1;
int coef = 1<<10;
int ncoef = 32767 - coef;
......@@ -124,49 +119,40 @@ int lte_est_freq_offset(int **dl_ch_estimates,
phase_offset = 0.0;
// for (aa=0;aa<frame_parms->nb_antennas_rx*frame_parms->nb_antennas_tx;aa++) {
for (aa=0; aa<1; aa++) {
dl_ch = (int16_t *)&dl_ch_estimates[aa][12+ch_offset];
// Warning: only one antenna used
for (int aa = 0; aa < 1; aa++) {
c16_t *dl_ch = (c16_t *)&dl_ch_estimates[aa][12 + ch_offset];
dl_ch_shift = 6+(log2_approx(dl_channel_level(dl_ch,frame_parms))/2);
// printf("dl_ch_shift: %d\n",dl_ch_shift);
int dl_ch_shift = 6 + (log2_approx(dl_channel_level(dl_ch, frame_parms)) / 2);
c16_t *dl_ch_prev;
if (ch_offset == 0)
dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][12+(4-frame_parms->Ncp)*(frame_parms->ofdm_symbol_size)];
dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][12 + (4 - frame_parms->Ncp) * (frame_parms->ofdm_symbol_size)];
else
dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][12+0];
dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][12 + 0];
// calculate omega = angle(conj(dl_ch)*dl_ch_prev))
// printf("Computing freq_offset\n");
omega = dot_product(dl_ch,dl_ch_prev,(frame_parms->N_RB_DL/2 - 1)*12,dl_ch_shift);
//omega = dot_product(dl_ch,dl_ch_prev,frame_parms->ofdm_symbol_size,15);
omega_cpx.r = ((c16_t*) &omega)->r;
omega_cpx.i = ((c16_t*) &omega)->i;
c32_t omega_tmp = dot_product(dl_ch, dl_ch_prev, (frame_parms->N_RB_DL / 2 - 1) * 12, dl_ch_shift);
cd_t omega_cpx = {omega_tmp.r, omega_tmp.i};
// omega = dot_product(dl_ch,dl_ch_prev,frame_parms->ofdm_symbol_size,15);
dl_ch = (int16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL/2) + 1)*12) + ch_offset];
dl_ch = (c16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL / 2) + 1) * 12) + ch_offset];
if (ch_offset == 0)
dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL/2) + 1)*12)+(4-frame_parms->Ncp)*(frame_parms->ofdm_symbol_size)];
dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL / 2) + 1) * 12) + (4 - frame_parms->Ncp) * (frame_parms->ofdm_symbol_size)];
else
dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][((frame_parms->N_RB_DL/2) + 1)*12];
dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][((frame_parms->N_RB_DL / 2) + 1) * 12];
// calculate omega = angle(conj(dl_ch)*dl_ch_prev))
omega = dot_product(dl_ch,dl_ch_prev,((frame_parms->N_RB_DL/2) - 1)*12,dl_ch_shift);
omega_cpx.r += ((c16_t*) &omega)->r;
omega_cpx.i += ((c16_t*) &omega)->i;
c32_t omega = dot_product(dl_ch, dl_ch_prev, ((frame_parms->N_RB_DL / 2) - 1) * 12, dl_ch_shift);
// phase_offset += atan2((double)omega_cpx->i,(double)omega_cpx->r);
phase_offset += atan2((double)omega_cpx.i,(double)omega_cpx.r);
// LOG_I(PHY,"omega (%d,%d) -> %f\n",omega_cpx->r,omega_cpx->i,phase_offset);
omega_cpx.r += omega.r;
omega_cpx.i += omega.i;
phase_offset += atan2(omega_cpx.i, omega_cpx.r);
}
// phase_offset /= (frame_parms->nb_antennas_rx*frame_parms->nb_antennas_tx);
freq_offset_est = (int) (phase_offset/(2*M_PI)/(frame_parms->Ncp==NORMAL ? (285.8e-6):(2.5e-4))); //2.5e-4 is the time between pilot symbols
// LOG_I(PHY,"symbol %d : freq_offset_est %d\n",l,freq_offset_est);
// update freq_offset with phase_offset using a moving average filter
if (first_run == 1) {
......@@ -175,14 +161,6 @@ int lte_est_freq_offset(int **dl_ch_estimates,
} else
*freq_offset = ((freq_offset_est * coef) + (*freq_offset * ncoef)) >> 15;
//#ifdef DEBUG_PHY
// msg("l=%d, phase_offset = %f (%d,%d), freq_offset_est = %d Hz, freq_offset_filt = %d \n",l,phase_offset,omega_cpx->r,omega_cpx->i,freq_offset_est,*freq_offset);
/*
for (i=0;i<150;i++)
msg("i %d : %d,%d <=> %d,%d\n",i,dl_ch[2*i],dl_ch[(2*i)+1],dl_ch_prev[2*i],dl_ch_prev[(2*i)+1]);
*/
//#endif
return(0);
}
//******************************************************************************************************
......@@ -193,13 +171,9 @@ int lte_mbsfn_est_freq_offset(int **dl_ch_estimates,
int l,
int* freq_offset)
{
int ch_offset, omega, dl_ch_shift;
c16_t *omega_cpx;
int ch_offset;
double phase_offset;
int freq_offset_est;
unsigned char aa;
int16_t *dl_ch,*dl_ch_prev;
static int first_run = 1;
int coef = 1<<10;
int ncoef = 32767 - coef;
......@@ -213,52 +187,38 @@ int lte_mbsfn_est_freq_offset(int **dl_ch_estimates,
}
phase_offset = 0.0;
// Warning: only one antenna used
for (int aa = 0; aa < 1; aa++) {
c16_t *dl_ch_prev;
c16_t *dl_ch = (c16_t *)&dl_ch_estimates[aa][12 + ch_offset];
// for (aa=0;aa<frame_parms->nb_antennas_rx*frame_parms->nb_antennas_tx;aa++) {
for (aa=0; aa<1; aa++) {
dl_ch = (int16_t *)&dl_ch_estimates[aa][12+ch_offset];
dl_ch_shift = 4+(log2_approx(dl_channel_level(dl_ch,frame_parms))/2);
int dl_ch_shift = 4 + (log2_approx(dl_channel_level(dl_ch, frame_parms)) / 2);
// printf("dl_ch_shift: %d\n",dl_ch_shift);
if (ch_offset == 0)
dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][12+(10*(frame_parms->ofdm_symbol_size))];
dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][12 + (10 * (frame_parms->ofdm_symbol_size))];
else
dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][12+6];
//else if
// dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][12+0];
// calculate omega = angle(conj(dl_ch)*dl_ch_prev))
// printf("Computing freq_offset\n");
omega = dot_product(dl_ch,dl_ch_prev,(frame_parms->N_RB_DL/2 - 1)*12,dl_ch_shift);
//omega = dot_product(dl_ch,dl_ch_prev,frame_parms->ofdm_symbol_size,15);
omega_cpx = (c16_t*) &omega;
dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][12 + 6];
// printf("omega (%d,%d)\n",omega_cpx->r,omega_cpx->i);
const c32_t omega_tmp = dot_product(dl_ch, dl_ch_prev, (frame_parms->N_RB_DL / 2 - 1) * 12, dl_ch_shift);
cd_t omega_cpx = {omega_tmp.r, omega_tmp.i};
dl_ch = (int16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL/2) + 1)*12) + ch_offset];
dl_ch = (c16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL / 2) + 1) * 12) + ch_offset];
if (ch_offset == 0)
dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL/2) + 1)*12)+10*(frame_parms->ofdm_symbol_size)];
dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][(((frame_parms->N_RB_DL / 2) + 1) * 12) + 10 * (frame_parms->ofdm_symbol_size)];
else
dl_ch_prev = (int16_t *)&dl_ch_estimates[aa][((frame_parms->N_RB_DL/2) + 1)*12+6];
dl_ch_prev = (c16_t *)&dl_ch_estimates[aa][((frame_parms->N_RB_DL / 2) + 1) * 12 + 6];
// calculate omega = angle(conj(dl_ch)*dl_ch_prev))
omega = dot_product(dl_ch,dl_ch_prev,((frame_parms->N_RB_DL/2) - 1)*12,dl_ch_shift);
omega_cpx->r += ((c16_t*) &omega)->r;
omega_cpx->i += ((c16_t*) &omega)->i;
// phase_offset += atan2((double)omega_cpx->i,(double)omega_cpx->r);
phase_offset += atan2((double)omega_cpx->i,(double)omega_cpx->r);
// printf("omega (%d,%d) -> %f\n",omega_cpx->r,omega_cpx->i,phase_offset);
c32_t omega2 = dot_product(dl_ch, dl_ch_prev, ((frame_parms->N_RB_DL / 2) - 1) * 12, dl_ch_shift);
omega_cpx.r += omega2.r;
omega_cpx.i += omega2.i;
phase_offset += atan2(omega_cpx.i, omega_cpx.r);
}
// phase_offset /= (frame_parms->nb_antennas_rx*frame_parms->nb_antennas_tx);
freq_offset_est = (int) (phase_offset/(2*M_PI)/2.5e-4); //2.5e-4 is the time between pilot symbols
// printf("symbol %d : freq_offset_est %d\n",l,freq_offset_est);
// update freq_offset with phase_offset using a moving average filter
if (first_run == 1) {
*freq_offset = freq_offset_est;
......@@ -266,13 +226,5 @@ int lte_mbsfn_est_freq_offset(int **dl_ch_estimates,
} else
*freq_offset = ((freq_offset_est * coef) + (*freq_offset * ncoef)) >> 15;
//#ifdef DEBUG_PHY
// msg("l=%d, phase_offset = %f (%d,%d), freq_offset_est = %d Hz, freq_offset_filt = %d \n",l,phase_offset,omega_cpx->r,omega_cpx->i,freq_offset_est,*freq_offset);
/*
for (i=0;i<150;i++)
msg("i %d : %d,%d <=> %d,%d\n",i,dl_ch[2*i],dl_ch[(2*i)+1],dl_ch_prev[2*i],dl_ch_prev[(2*i)+1]);
*/
//#endif
return(0);
}
......@@ -75,28 +75,6 @@ void lte_sync_timefreq(PHY_VARS_UE *ue,
int band,
unsigned int DL_freq);
/*!
\brief This function performs detection of the PRACH (=SRS) at the eNb to estimate the timing advance
The algorithm uses a time domain correlation with a downsampled version of the received signal.
\param rxdata Received time domain data for all rx antennas
\param frame_parms LTE DL frame parameter structure
\param length Length for correlation
\param peak_val pointer to value of returned peak
\param sync_corr_eNb pointer to correlation buffer
\return sync_pos Position of the sync within the frame (downsampled) if successfull and -1 if there was an error or no peak was detected.
*/
int lte_sync_time_eNB(int32_t **rxdata,
LTE_DL_FRAME_PARMS *frame_parms,
uint32_t length,
uint32_t *peak_val,
uint32_t *sync_corr_eNb);
int lte_sync_time_eNB_emul(PHY_VARS_eNB *phy_vars_eNb,
uint8_t sect_id,
int32_t *sync_val);
int ru_sync_time_init(RU_t *ru);
int ru_sync_time(RU_t *ru,
......
......@@ -34,9 +34,7 @@
// Note: this is for prototype of generate_drs_pusch (OTA synchronization of RRUs)
#include "PHY/LTE_UE_TRANSPORT/transport_proto_ue.h"
static c16_t *primary_synch0_time __attribute__((aligned(32)));
static c16_t *primary_synch1_time __attribute__((aligned(32)));
static c16_t *primary_synch2_time __attribute__((aligned(32)));
static c16_t *primary_synch_time[3] __attribute__((aligned(32)));
static void doIdft(int size, short *in, short *out) {
switch (size) {
......@@ -84,25 +82,23 @@ static void copyPrimary( c16_t *out, struct complex16 *in, int ofdmSize) {
int lte_sync_time_init(LTE_DL_FRAME_PARMS *frame_parms ) { // LTE_UE_COMMON *common_vars
c16_t syncF_tmp[2048]__attribute__((aligned(32)))= {{0}};
int sz=frame_parms->ofdm_symbol_size*sizeof(*primary_synch0_time);
AssertFatal( NULL != (primary_synch0_time = (c16_t *)malloc16(sz)),"");
bzero(primary_synch0_time,sz);
AssertFatal( NULL != (primary_synch1_time = (c16_t *)malloc16(sz)),"");
bzero(primary_synch1_time,sz);
AssertFatal( NULL != (primary_synch2_time = (c16_t *)malloc16(sz)),"");
bzero(primary_synch2_time,sz);
int sz = frame_parms->ofdm_symbol_size * sizeof(**primary_synch_time);
for (int i = 0; i < sizeofArray(primary_synch_time); i++) {
primary_synch_time[i] = (c16_t *)malloc16_clear(sz);
AssertFatal(primary_synch_time[i],"no memory\n");
}
// generate oversampled sync_time sequences
copyPrimary( syncF_tmp, (c16_t *) primary_synch0, frame_parms->ofdm_symbol_size);
doIdft(frame_parms->N_RB_DL, (short *)syncF_tmp,(short *)primary_synch0_time);
doIdft(frame_parms->N_RB_DL, (short *)syncF_tmp, (short *)primary_synch_time[0]);
copyPrimary( syncF_tmp, (c16_t *) primary_synch1, frame_parms->ofdm_symbol_size);
doIdft(frame_parms->N_RB_DL, (short *)syncF_tmp,(short *)primary_synch1_time);
doIdft(frame_parms->N_RB_DL, (short *)syncF_tmp, (short *)primary_synch_time[1]);
copyPrimary( syncF_tmp, (c16_t *) primary_synch2, frame_parms->ofdm_symbol_size);
doIdft(frame_parms->N_RB_DL, (short *)syncF_tmp,(short *)primary_synch2_time);
doIdft(frame_parms->N_RB_DL, (short *)syncF_tmp, (short *)primary_synch_time[2]);
if ( LOG_DUMPFLAG(DEBUG_LTEESTIM)){
LOG_M("primary_sync0.m","psync0",primary_synch0_time,frame_parms->ofdm_symbol_size,1,1);
LOG_M("primary_sync1.m","psync1",primary_synch1_time,frame_parms->ofdm_symbol_size,1,1);
LOG_M("primary_sync2.m","psync2",primary_synch2_time,frame_parms->ofdm_symbol_size,1,1);
LOG_M("primary_sync0.m", "psync0", primary_synch_time[0], frame_parms->ofdm_symbol_size, 1, 1);
LOG_M("primary_sync1.m", "psync1", primary_synch_time[1], frame_parms->ofdm_symbol_size, 1, 1);
LOG_M("primary_sync2.m", "psync2", primary_synch_time[2], frame_parms->ofdm_symbol_size, 1, 1);
}
return (1);
......@@ -110,24 +106,11 @@ int lte_sync_time_init(LTE_DL_FRAME_PARMS *frame_parms ) { // LTE_UE_COMMON *com
void lte_sync_time_free(void) {
if (primary_synch0_time) {
LOG_D(PHY,"Freeing primary_sync0_time ...\n");
free(primary_synch0_time);
}
if (primary_synch1_time) {
LOG_D(PHY,"Freeing primary_sync1_time ...\n");
free(primary_synch1_time);
}
if (primary_synch2_time) {
LOG_D(PHY,"Freeing primary_sync2_time ...\n");
free(primary_synch2_time);
}
primary_synch0_time = NULL;
primary_synch1_time = NULL;
primary_synch2_time = NULL;
for (int i = 0; i < sizeofArray(primary_synch_time); i++)
if (primary_synch_time[i]) {
LOG_D(PHY, "Freeing primary_sync0_time ...\n");
free_and_zero(primary_synch_time[i]);
}
}
......@@ -141,7 +124,6 @@ int lte_sync_time(int **rxdata, ///rx data in time domain
int *eNB_id) {
// perform a time domain correlation using the oversampled sync sequence
unsigned int n, ar, s, peak_pos, peak_val, sync_source;
int result,result2;
struct complexd sync_out[3]= {{0}}, sync_out2[3]= {{0}};
int length = LTE_NUMBER_OF_SUBFRAMES_PER_FRAME*frame_parms->samples_per_tti>>1;
peak_val = 0;
......@@ -157,32 +139,15 @@ int lte_sync_time(int **rxdata, ///rx data in time domain
// if (n<(length-frame_parms->ofdm_symbol_size-frame_parms->nb_prefix_samples)) {
if (n<(length-frame_parms->ofdm_symbol_size)) {
//calculate dot product of primary_synch0_time and rxdata[ar][n] (ar=0..nb_ant_rx) and store the sum in temp[n];
for (ar=0; ar<frame_parms->nb_antennas_rx; ar++) {
result = dot_product((short *)primary_synch0_time, (short *) &(rxdata[ar][n]), frame_parms->ofdm_symbol_size, SHIFT);
result2 = dot_product((short *)primary_synch0_time, (short *) &(rxdata[ar][n+length]), frame_parms->ofdm_symbol_size, SHIFT);
sync_out[0].r += ((short *) &result)[0];
sync_out[0].i += ((short *) &result)[1];
sync_out2[0].r += ((short *) &result2)[0];
sync_out2[0].i += ((short *) &result2)[1];
}
for (ar=0; ar<frame_parms->nb_antennas_rx; ar++) {
result = dot_product((short *)primary_synch1_time, (short *) &(rxdata[ar][n]), frame_parms->ofdm_symbol_size, SHIFT);
result2 = dot_product((short *)primary_synch1_time, (short *) &(rxdata[ar][n+length]), frame_parms->ofdm_symbol_size, SHIFT);
sync_out[1].r += ((short *) &result)[0];
sync_out[1].i += ((short *) &result)[1];
sync_out2[1].r += ((short *) &result2)[0];
sync_out2[1].i += ((short *) &result2)[1];
}
for (ar=0; ar<frame_parms->nb_antennas_rx; ar++) {
result = dot_product((short *)primary_synch2_time, (short *) &(rxdata[ar][n]), frame_parms->ofdm_symbol_size, SHIFT);
result2 = dot_product((short *)primary_synch2_time, (short *) &(rxdata[ar][n+length]), frame_parms->ofdm_symbol_size, SHIFT);
sync_out[2].r += ((short *) &result)[0];
sync_out[2].i += ((short *) &result)[1];
sync_out2[2].r += ((short *) &result2)[0];
sync_out2[2].i += ((short *) &result2)[1];
}
for (int i = 0; i < sizeofArray(primary_synch_time); i++)
for (ar = 0; ar < frame_parms->nb_antennas_rx; ar++) {
c32_t result = dot_product(primary_synch_time[i], (c16_t *)&(rxdata[ar][n]), frame_parms->ofdm_symbol_size, SHIFT);
c32_t result2 = dot_product(primary_synch_time[i], (c16_t *)&(rxdata[ar][n + length]), frame_parms->ofdm_symbol_size, SHIFT);
sync_out[i].r += result.r;
sync_out[i].i += result.i;
sync_out2[i].r += result2.r;
sync_out2[i].i += result2.i;
}
}
// calculate the absolute value of sync_corr[n]
......@@ -274,88 +239,6 @@ void ru_sync_time_free(RU_t *ru) {
free(ru->dmrs_corr);
}
//#define DEBUG_PHY
int lte_sync_time_eNB(int32_t **rxdata, ///rx data in time domain
LTE_DL_FRAME_PARMS *frame_parms,
uint32_t length,
uint32_t *peak_val_out,
uint32_t *sync_corr_eNB) {
// perform a time domain correlation using the oversampled sync sequence
unsigned int n, ar, peak_val, peak_pos;
uint64_t mean_val;
int result;
short *primary_synch_time;
int eNB_id = frame_parms->Nid_cell%3;
// LOG_E(PHY,"[SYNC TIME] Calling sync_time_eNB(%p,%p,%d,%d)\n",rxdata,frame_parms,eNB_id,length);
if (sync_corr_eNB == NULL) {
LOG_E(PHY,"[SYNC TIME] sync_corr_eNB not yet allocated! Exiting.\n");
return(-1);
}
switch (eNB_id) {
case 0:
primary_synch_time = (short *)primary_synch0_time;
break;
case 1:
primary_synch_time = (short *)primary_synch1_time;
break;
case 2:
primary_synch_time = (short *)primary_synch2_time;
break;
default:
LOG_E(PHY,"[SYNC TIME] Illegal eNB_id!\n");
return (-1);
}
peak_val = 0;
peak_pos = 0;
mean_val = 0;
for (n=0; n<length; n+=4) {
sync_corr_eNB[n] = 0;
if (n<(length-frame_parms->ofdm_symbol_size-frame_parms->nb_prefix_samples)) {
//calculate dot product of primary_synch0_time and rxdata[ar][n] (ar=0..nb_ant_rx) and store the sum in temp[n];
for (ar=0; ar<frame_parms->nb_antennas_rx; ar++) {
result = dot_product((short *)primary_synch_time, (short *) &(rxdata[ar][n]), frame_parms->ofdm_symbol_size, SHIFT);
//((short*)sync_corr)[2*n] += ((short*) &result)[0];
//((short*)sync_corr)[2*n+1] += ((short*) &result)[1];
sync_corr_eNB[n] += squaredMod(*(c16_t*)&result);
}
}
/*
if (eNB_id == 2) {
printf("sync_time_eNB %d : %d,%d (%d)\n",n,sync_corr_eNB[n],mean_val,
peak_val);
}
*/
mean_val += sync_corr_eNB[n];
if (sync_corr_eNB[n]>peak_val) {
peak_val = sync_corr_eNB[n];
peak_pos = n;
}
}
mean_val/=length;
*peak_val_out = peak_val;
if (peak_val <= (40*(uint32_t)mean_val)) {
LOG_I(PHY,"[SYNC TIME] No peak found (%u,%u,%"PRIu64",%"PRIu64")\n",peak_pos,peak_val,mean_val,40*mean_val);
return(-1);
} else {
LOG_I(PHY,"[SYNC TIME] Peak found at pos %u, val = %u, mean_val = %"PRIu64"\n",peak_pos,peak_val,mean_val);
return(peak_pos);
}
}
int ru_sync_time(RU_t *ru,
int64_t *lev,
int64_t *avg) {
......@@ -372,8 +255,7 @@ int ru_sync_time(RU_t *ru,
int32_t maxlev0=0;
int maxpos0=0;
int64_t avg0=0;
int64_t result;
int64_t avg0 = 0;
int64_t dmrs_corr;
int maxval=0;
......@@ -396,19 +278,12 @@ int ru_sync_time(RU_t *ru,
//calculate dot product of primary_synch0_time and rxdata[ar][n] (ar=0..nb_ant_rx) and store the sum in temp[n];
for (int ar=0; ar<ru->nb_rx; ar++) {
result = dot_product64(ru->dmrssync,
(int16_t *) &ru->common.rxdata[ar][n],
frame_parms->ofdm_symbol_size,
shift);
if (ru->state == RU_CHECK_SYNC) {
result = dot_product64((int16_t *) &calibration->drs_ch_estimates_time[ar],
(int16_t *) &ru->common.rxdata[ar][n],
frame_parms->ofdm_symbol_size,
shift);
}
dmrs_corr += squaredMod(*(c32_t*)&result);
const c16_t *input = ru->state == RU_CHECK_SYNC ?
(c16_t *)&calibration->drs_ch_estimates_time[ar] :
(c16_t *)ru->dmrssync;
c32_t result = dot_product(input, (c16_t *)&ru->common.rxdata[ar][n], frame_parms->ofdm_symbol_size, shift);
const c64_t tmp = {result.r, result.i};
dmrs_corr += squaredMod(tmp);
}
if (ru->dmrs_corr != NULL)
......
......@@ -651,7 +651,6 @@ int pss_synchro_nr(PHY_VARS_NR_UE *PHY_vars_UE, int is, int rate_change)
*
*********************************************************************/
#define DOT_PRODUCT_SCALING_SHIFT (17)
int pss_search_time_nr(int **rxdata, ///rx data in time domain
NR_DL_FRAME_PARMS *frame_parms,
int fo_flag,
......@@ -694,8 +693,7 @@ int pss_search_time_nr(int **rxdata, ///rx data in time domain
/* Correlation computation is based on a a dot product which is realized thank to SIMS extensions */
for (int pss_index = 0; pss_index < NUMBER_PSS_SEQUENCE; pss_index++) {
for (n=0; n < length; n+=4) { //
for (n = 0; n < length; n += 8) { //
int64_t pss_corr_ue=0;
/* calculate dot product of primary_synchro_time_nr and rxdata[ar][n]
......@@ -703,12 +701,8 @@ int pss_search_time_nr(int **rxdata, ///rx data in time domain
for (ar=0; ar<frame_parms->nb_antennas_rx; ar++) {
/* perform correlation of rx data and pss sequence ie it is a dot product */
const int64_t result = dot_product64((short *)primary_synchro_time_nr[pss_index],
(short *)&(rxdata[ar][n + is * frame_parms->samples_per_frame]),
frame_parms->ofdm_symbol_size,
shift);
const c32_t r32 = *(c32_t*)&result;
const c64_t r64 = {.r = r32.r, .i = r32.i};
const c32_t result = dot_product((c16_t *)primary_synchro_time_nr[pss_index], (c16_t *)&(rxdata[ar][n + is * frame_parms->samples_per_frame]), frame_parms->ofdm_symbol_size, shift);
const c64_t r64 = {.r = result.r, .i = result.i};
pss_corr_ue += squaredMod(r64);
//((short*)pss_corr_ue[pss_index])[2*n] += ((short*) &result)[0]; /* real part */
//((short*)pss_corr_ue[pss_index])[2*n+1] += ((short*) &result)[1]; /* imaginary part */
......@@ -736,27 +730,17 @@ int pss_search_time_nr(int **rxdata, ///rx data in time domain
// fractional frequency offset computation according to Cross-correlation Synchronization Algorithm Using PSS
// Shoujun Huang, Yongtao Su, Ying He and Shan Tang, "Joint time and frequency offset estimation in LTE downlink," 7th International Conference on Communications and Networking in China, 2012.
int64_t result1,result2;
// Computing cross-correlation at peak on half the symbol size for first half of data
result1 = dot_product64((short*)primary_synchro_time_nr[pss_source],
(short*) &(rxdata[0][peak_position+is*frame_parms->samples_per_frame]),
frame_parms->ofdm_symbol_size>>1,
shift);
// Computing cross-correlation at peak on half the symbol size for data shifted by half symbol size
// as it is real and complex it is necessary to shift by a value equal to symbol size to obtain such shift
result2 = dot_product64((short*)primary_synchro_time_nr[pss_source]+(frame_parms->ofdm_symbol_size),
(short*) &(rxdata[0][peak_position+is*frame_parms->samples_per_frame])+frame_parms->ofdm_symbol_size,
frame_parms->ofdm_symbol_size>>1,
shift);
int64_t re1,re2,im1,im2;
re1=((int*) &result1)[0];
re2=((int*) &result2)[0];
im1=((int*) &result1)[1];
im2=((int*) &result2)[1];
// estimation of fractional frequency offset: angle[(result1)'*(result2)]/pi
ffo_est=atan2(re1*im2-re2*im1,re1*re2+im1*im2)/M_PI;
// Computing cross-correlation at peak on half the symbol size for first half of data
c32_t r1 = dot_product((c16_t *)primary_synchro_time_nr[pss_source], (c16_t *)&(rxdata[0][peak_position + is * frame_parms->samples_per_frame]), frame_parms->ofdm_symbol_size >> 1, shift);
// Computing cross-correlation at peak on half the symbol size for data shifted by half symbol size
// as it is real and complex it is necessary to shift by a value equal to symbol size to obtain such shift
c32_t r2 = dot_product((c16_t *)primary_synchro_time_nr[pss_source] + (frame_parms->ofdm_symbol_size >> 1),
(c16_t *)&(rxdata[0][peak_position + is * frame_parms->samples_per_frame]) + (frame_parms->ofdm_symbol_size >> 1),
frame_parms->ofdm_symbol_size >> 1,
shift);
cd_t r1d = {r1.r, r1.i}, r2d = {r2.r, r2.i};
// estimation of fractional frequency offset: angle[(result1)'*(result2)]/pi
ffo_est = atan2(r1d.r * r2d.i - r2d.r * r1d.i, r1d.r * r2d.r + r1d.i * r2d.i) / M_PI;
#ifdef DBG_PSS_NR
printf("ffo %lf\n",ffo_est);
......
......@@ -29,256 +29,51 @@ void print_ints(char *s,__m128i *x);
void print_shorts(char *s,__m128i *x);
void print_bytes(char *s,__m128i *x);
#endif
int32_t dot_product(int16_t *x,
int16_t *y,
uint32_t N, //must be a multiple of 8
uint8_t output_shift)
{
uint32_t n;
#if defined(__x86_64__) || defined(__i386__)
__m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im;
__m64 mmtmp7;
__m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1);
int32_t result;
x128 = (__m128i*) x;
y128 = (__m128i*) y;
mmcumul_re = _mm_setzero_si128();
mmcumul_im = _mm_setzero_si128();
for (n=0; n<(N>>2); n++) {
//printf("n=%d, x128=%p, y128=%p\n",n,x128,y128);
// print_shorts("x",&x128[0]);
// print_shorts("y",&y128[0]);
// this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y)
mmtmp1 = _mm_madd_epi16(x128[0],y128[0]);
// print_ints("re",&mmtmp1);
// mmtmp1 contains real part of 4 consecutive outputs (32-bit)
// shift and accumulate results
mmtmp1 = _mm_srai_epi32(mmtmp1,output_shift);
mmcumul_re = _mm_add_epi32(mmcumul_re,mmtmp1);
// print_ints("re",&mmcumul_re);
// this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x)
mmtmp2 = _mm_shufflelo_epi16(y128[0],_MM_SHUFFLE(2,3,0,1));
// print_shorts("y",&mmtmp2);
mmtmp2 = _mm_shufflehi_epi16(mmtmp2,_MM_SHUFFLE(2,3,0,1));
// print_shorts("y",&mmtmp2);
mmtmp2 = _mm_sign_epi16(mmtmp2,minus_i);
// print_shorts("y",&mmtmp2);
mmtmp3 = _mm_madd_epi16(x128[0],mmtmp2);
// print_ints("im",&mmtmp3);
// mmtmp3 contains imag part of 4 consecutive outputs (32-bit)
// shift and accumulate results
mmtmp3 = _mm_srai_epi32(mmtmp3,output_shift);
mmcumul_im = _mm_add_epi32(mmcumul_im,mmtmp3);
// print_ints("im",&mmcumul_im);
x128++;
y128++;
}
// this gives Re Re Im Im
mmcumul = _mm_hadd_epi32(mmcumul_re,mmcumul_im);
// print_ints("cumul1",&mmcumul);
// this gives Re Im Re Im
mmcumul = _mm_hadd_epi32(mmcumul,mmcumul);
// print_ints("cumul2",&mmcumul);
//mmcumul = _mm_srai_epi32(mmcumul,output_shift);
// extract the lower half
mmtmp7 = _mm_movepi64_pi64(mmcumul);
// print_ints("mmtmp7",&mmtmp7);
// pack the result
mmtmp7 = _mm_packs_pi32(mmtmp7,mmtmp7);
// print_shorts("mmtmp7",&mmtmp7);
// convert back to integer
result = _mm_cvtsi64_si32(mmtmp7);
_mm_empty();
_m_empty();
return(result);
#elif defined(__arm__) || defined(__aarch64__)
int16x4_t *x_128=(int16x4_t*)x;
int16x4_t *y_128=(int16x4_t*)y;
int32x4_t tmp_re,tmp_im;
int32x4_t tmp_re1,tmp_im1;
int32x4_t re_cumul,im_cumul;
int32x2_t re_cumul2,im_cumul2;
int32x4_t shift = vdupq_n_s32(-output_shift);
int32x2x2_t result2;
int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ;
re_cumul = vdupq_n_s32(0);
im_cumul = vdupq_n_s32(0);
for (n=0; n<(N>>2); n++) {
tmp_re = vmull_s16(*x_128++, *y_128++);
//tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])]
tmp_re1 = vmull_s16(*x_128++, *y_128++);
//tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])]
tmp_re = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
//tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2]) Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])]
tmp_im = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
//tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
//tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
tmp_im = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
//tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]
re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift));
im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift));
}
re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul));
im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul));
re_cumul2 = vpadd_s32(re_cumul2,re_cumul2);
im_cumul2 = vpadd_s32(im_cumul2,im_cumul2);
result2 = vzip_s32(re_cumul2,im_cumul2);
return(vget_lane_s32(result2.val[0],0));
#endif
}
int64_t dot_product64(int16_t *x,
int16_t *y,
uint32_t N, //must be a multiple of 8
uint8_t output_shift)
/*! \brief Complex number dot_product
WARNING: the OAI historical name is dot_product but it is not: it is sum(x*y) not, sum(x*conjugate(y))
*/
c32_t dot_product(const c16_t *x,//! input vector
const c16_t *y,//! input vector
const uint32_t N,//! size of the vectors
const int output_shift //! normalization afer int16 multiplication
)
{
uint32_t n;
#if defined(__x86_64__) || defined(__i386__)
__m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im;
__m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1);
int64_t result;
x128 = (__m128i*) x;
y128 = (__m128i*) y;
mmcumul_re = _mm_setzero_si128();
mmcumul_im = _mm_setzero_si128();
for (n=0; n<(N>>2); n++) {
// printf("n=%d, x128=%p, y128=%p\n",n,x128,y128);
// print_shorts("x",&x128[0]);
// print_shorts("y",&y128[0]);
// this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y)
mmtmp1 = _mm_madd_epi16(x128[0],y128[0]);
// print_ints("retmp",&mmtmp1);
// mmtmp1 contains real part of 4 consecutive outputs (32-bit)
// shift and accumulate results
mmtmp1 = _mm_srai_epi32(mmtmp1,output_shift);
mmcumul_re = _mm_add_epi32(mmcumul_re,mmtmp1);
//print_ints("re",&mmcumul_re);
// this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x)
mmtmp2 = _mm_shufflelo_epi16(y128[0],_MM_SHUFFLE(2,3,0,1));
//print_shorts("y",&mmtmp2);
mmtmp2 = _mm_shufflehi_epi16(mmtmp2,_MM_SHUFFLE(2,3,0,1));
//print_shorts("y",&mmtmp2);
mmtmp2 = _mm_sign_epi16(mmtmp2,minus_i);
// print_shorts("y",&mmtmp2);
mmtmp3 = _mm_madd_epi16(x128[0],mmtmp2);
//print_ints("imtmp",&mmtmp3);
// mmtmp3 contains imag part of 4 consecutive outputs (32-bit)
// shift and accumulate results
mmtmp3 = _mm_srai_epi32(mmtmp3,output_shift);
mmcumul_im = _mm_add_epi32(mmcumul_im,mmtmp3);
//print_ints("im",&mmcumul_im);
x128++;
y128++;
const int16_t reflip[32] __attribute__((aligned(32))) = {1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1};
const int8_t imshuffle[64] __attribute__((aligned(32))) = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, 18, 19, 16, 17, 22, 23, 20, 21, 26, 27, 24, 25, 30, 31, 28, 29};
const c16_t *end = x + N;
__m256i cumul_re = {0}, cumul_im = {0};
while (x < end) {
const __m256i in1 = simde_mm256_loadu_si256((__m256i *)x);
const __m256i in2 = simde_mm256_loadu_si256((__m256i *)y);
const __m256i tmpRe = simde_mm256_madd_epi16(in1, in2);
cumul_re = simde_mm256_add_epi32(cumul_re, simde_mm256_srai_epi32(tmpRe, output_shift));
const __m256i tmp1 = simde_mm256_shuffle_epi8(in2, *(__m256i *)imshuffle);
const __m256i tmp2 = simde_mm256_sign_epi16(tmp1, *(__m256i *)reflip);
const __m256i tmpIm = simde_mm256_madd_epi16(in1, tmp2);
cumul_im = simde_mm256_add_epi32(cumul_im, simde_mm256_srai_epi32(tmpIm, output_shift));
x += 8;
y += 8;
}
// this gives Re Re Im Im
mmcumul = _mm_hadd_epi32(mmcumul_re,mmcumul_im);
//print_ints("cumul1",&mmcumul);
// this gives Re Im Re Im
mmcumul = _mm_hadd_epi32(mmcumul,mmcumul);
//print_ints("cumul2",&mmcumul);
//mmcumul = _mm_srai_epi32(mmcumul,output_shift);
// extract the lower half
result = _mm_extract_epi64(mmcumul,0);
//printf("result: (%d,%d)\n",((int32_t*)&result)[0],((int32_t*)&result)[1]);
_mm_empty();
_m_empty();
return(result);
#elif defined(__arm__) || defined(__aarch64__)
int16x4_t *x_128=(int16x4_t*)x;
int16x4_t *y_128=(int16x4_t*)y;
int32x4_t tmp_re,tmp_im;
int32x4_t tmp_re1,tmp_im1;
int32x4_t re_cumul,im_cumul;
int32x2_t re_cumul2,im_cumul2;
int32x4_t shift = vdupq_n_s32(-output_shift);
int32x2x2_t result2;
int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ;
re_cumul = vdupq_n_s32(0);
im_cumul = vdupq_n_s32(0);
for (n=0; n<(N>>2); n++) {
tmp_re = vmull_s16(*x_128++, *y_128++);
//tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])]
tmp_re1 = vmull_s16(*x_128++, *y_128++);
//tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])]
tmp_re = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
//tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2]) Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])]
tmp_im = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
//tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
//tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
tmp_im = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
//tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]
re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift));
im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift));
// this gives Re Re Im Im Re Re Im Im
const __m256i cumulTmp = simde_mm256_hadd_epi32(cumul_re, cumul_im);
const __m256i cumul = simde_mm256_hadd_epi32(cumulTmp, cumulTmp);
c32_t ret;
ret.r = simde_mm256_extract_epi32(cumul, 0) + simde_mm256_extract_epi32(cumul, 4);
ret.i = simde_mm256_extract_epi32(cumul, 1) + simde_mm256_extract_epi32(cumul, 5);
if (x!=end) {
x-=8;
y-=8;
for ( ; x <end; x++,y++ ) {
ret.r += ((x->r*y->r)>>output_shift) + ((x->i*y->i)>>output_shift);
ret.i += ((x->r*y->i)>>output_shift) - ((x->i*y->r)>>output_shift);
}
}
re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul));
im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul));
re_cumul2 = vpadd_s32(re_cumul2,re_cumul2);
im_cumul2 = vpadd_s32(im_cumul2,im_cumul2);
result2 = vzip_s32(re_cumul2,im_cumul2);
return(vget_lane_s32(result2.val[0],0));
#endif
return ret;
}
#ifdef MAIN
void print_bytes(char *s,__m128i *x)
{
......
......@@ -742,16 +742,10 @@ int32_t phy_phase_compensation_top(uint32_t pilot_type,
uint32_t last_pilot,
int32_t ignore_prefix);
int32_t dot_product(int16_t *x,
int16_t *y,
uint32_t N, //must be a multiple of 8
uint8_t output_shift);
int64_t dot_product64(int16_t *x,
int16_t *y,
uint32_t N, //must be a multiple of 8
uint8_t output_shift);
c32_t dot_product(const c16_t *x,
const c16_t *y,
const uint32_t N, // must be a multiple of 8
const int output_shift);
/** @} */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment