new PRACH channel using AVX instructions

e7b2b644 · lfarizav · 2e2bc97b · e7b2b644 · e7b2b644 · e7b2b644
Commit e7b2b644 authored Jun 14, 2018 by lfarizav
4 changed files
--- a/openair1/SIMULATION/TOOLS/abstraction.c
+++ b/openair1/SIMULATION/TOOLS/abstraction.c
@@ -291,7 +291,7 @@ int init_freq_channel_AVX_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_sa
      else
        delay = desc->delays[l]+NB_SAMPLES_CHANNEL_OFFSET/desc->sampling_rate;
-        sincos256_ps(_mm256_set_ps(twopi*(4*f+7)*delay,twopi*(4*f+6)*delay,twopi*(4*f+5)*delay,twopi*(4*f+4)*delay,twopi*(4*f+3)*delay,twopi*(4*f+2)*delay,twopi*(4*f+1)*delay,twopi*(4*f)*delay), &sin_lut256, &cos_lut256);
+        sincos256_ps(_mm256_set_ps(twopi*(8*f+7)*delay,twopi*(8*f+6)*delay,twopi*(8*f+5)*delay,twopi*(8*f+4)*delay,twopi*(8*f+3)*delay,twopi*(8*f+2)*delay,twopi*(8*f+1)*delay,twopi*(8*f)*delay), &sin_lut256, &cos_lut256);
        //cos_lut256=_mm256_set_ps(cos(twopi*(8*f+7)*delay),cos(twopi*(8*f+6)*delay),cos(twopi*(8*f+5)*delay),cos(twopi*(8*f+4)*delay),cos(twopi*(8*f+3)*delay),cos(twopi*(8*f+2)*delay),cos(twopi*(8*f+1)*delay),cos(twopi*(8*f)*delay));
        //sin_lut256=_mm256_set_ps(sin(twopi*(8*f+7)*delay),sin(twopi*(8*f+6)*delay),sin(twopi*(8*f+5)*delay),sin(twopi*(8*f+4)*delay),sin(twopi*(8*f+3)*delay),sin(twopi*(8*f+2)*delay),sin(twopi*(8*f+1)*delay),sin(twopi*(8*f)*delay));
        _mm256_storeu_ps(&cos_lut_f[l][8*f+(n_samples>>1)],cos_lut256);
@@ -580,7 +580,74 @@ int init_freq_channel_prach_SSE_float(channel_desc_t *desc,uint16_t nb_rb,int16_
  return(0);
 }
+int init_freq_channel_prach_AVX_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples,int16_t prach_fmt,int16_t n_ra_prb)
+{
+  static int first_run=1;
+  float delta_f,twopi;  // 90 kHz spacing
+  float delay;
+  int16_t f,f1;
+  uint8_t l;
+  int prach_samples, prach_pbr_offset_samples, max_nb_rb_samples;
+  __m256 cos_lut256,sin_lut256, cos_256, sin_256;
+  if ((n_samples&1)==0) {
+    fprintf(stderr, "freq_channel_init: n_samples has to be odd\n");
+    return(-1); 
+  }
+  if (nb_rb-n_ra_prb<6) {
+    fprintf(stderr, "freq_channel_init: Impossible to allocate PRACH, check n_ra_prb value (r_ra_prb=%d)\n",n_ra_prb);
+    return(-1); 
+  }
+  prach_samples = (prach_fmt<4)?13+839+12:3+139+2;
+  if (first_run)
+  {
+	cos_lut_f_prach = (float **)malloc16(prach_samples*sizeof(float*));
+	sin_lut_f_prach = (float **)malloc16(prach_samples*sizeof(float*));
+        for (f=max_nb_rb_samples/2-prach_pbr_offset_samples,f1=0; f<max_nb_rb_samples/2-prach_pbr_offset_samples+prach_samples; f++,f1++) {
+	    cos_lut_f_prach[f1] = (float *)malloc16_clear((int)desc->nb_taps*sizeof(float));
+	    sin_lut_f_prach[f1] = (float *)malloc16_clear((int)desc->nb_taps*sizeof(float));
+	}
+	first_run=0;
+  }
+  //cos_lut = (double **)malloc(prach_samples*sizeof(double*));
+  //sin_lut = (double **)malloc(prach_samples*sizeof(double*));
+  delta_f = (prach_fmt<4)?nb_rb*180000/((n_samples-1)*12):nb_rb*180000/((n_samples-1)*2);//1.25 khz for preamble format 1,2,3. 7.5 khz for preample format 4
+  max_nb_rb_samples = nb_rb*180000/delta_f;//7200 if prach_fmt<4
+  prach_pbr_offset_samples = (n_ra_prb+6)*180000/delta_f;//864 if n_ra_prb=0,7200 if n_ra_prb=44=50-6
+  twopi=2*M_PI*1e-6*delta_f;
+  //printf("prach_samples = %d, delta_f = %e, max_nb_rb_samples= %d, prach_pbr_offset_samples = %d, nb_taps = %d\n",prach_samples,delta_f,max_nb_rb_samples,prach_pbr_offset_samples,desc->nb_taps);
+  for (f=((max_nb_rb_samples/2-prach_pbr_offset_samples)>>3),f1=0; f<((max_nb_rb_samples/2-prach_pbr_offset_samples+prach_samples)>>3); f++,f1++) {//3600-864,3600-864+864|3600-7200,3600-7200+839
+    //freq=delta_f*(float)f*1e-6;// due to the fact that delays is in mus
+    //printf("[init_freq_channel_prach] freq %e\n",freq);
+    //cos_lut[f1] = (double *)malloc((int)desc->nb_taps*sizeof(double));
+    //sin_lut[f1] = (double *)malloc((int)desc->nb_taps*sizeof(double));
+    for (l=0; l<(int)desc->nb_taps; l++) {
+      if (desc->nb_taps==1)
+        delay = desc->delays[l];
+      else
+        delay = desc->delays[l]+NB_SAMPLES_CHANNEL_OFFSET/desc->sampling_rate;
+      //cos_lut256=_mm256_set_ps(cos(twopi*(8*f+3)*delay),cos(twopi*(8*f+2)*delay),cos(twopi*(8*f+1)*delay),cos(twopi*(8*f)*delay));
+      //sin_lut256=_mm256_set_ps(sin(twopi*(8*f+3)*delay),sin(twopi*(8*f+2)*delay),sin(twopi*(8*f+1)*delay),sin(twopi*(8*f)*delay));
+      sincos256_ps(_mm256_set_ps(twopi*(8*f+7)*delay,twopi*(8*f+6)*delay,twopi*(8*f+5)*delay,twopi*(8*f+4)*delay,twopi*(8*f+3)*delay,twopi*(8*f+2)*delay,twopi*(8*f+1)*delay,twopi*(8*f)*delay),&sin_256,&cos_256);
+      cos_lut256=cos_256;
+      sin_lut256=sin_256;
+      _mm256_storeu_ps(&cos_lut_f_prach[l][8*f1],cos_lut256);
+      _mm256_storeu_ps(&sin_lut_f_prach[l][8*f1],sin_lut256);
+      //cos_lut[f1][l] = cos(2*M_PI*freq*delay);
+      //sin_lut[f1][l] = sin(2*M_PI*freq*delay);
+      //if (f<max_nb_rb_samples/2-prach_pbr_offset_samples+10)
+      	//printf("freq: %e, f1: %d, f: %d, arg_sin_cos = %e,  cos () = %e, sin () =n %e)\n",freq, f1,f, 2*M_PI*freq*delay, cos_lut[f1][l], sin_lut[f1][l]);
+    }
+  }
+  return(0);
+}
 int freq_channel_prach(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples,int16_t prach_fmt,int16_t n_ra_prb)
 {
@@ -688,6 +755,62 @@ int freq_channel_prach_SSE_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_s
  stop_meas(&desc->interp_freq_PRACH);
  return(0);
 }
+int freq_channel_prach_AVX_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples,int16_t prach_fmt,int16_t n_ra_prb)
+{
+  int16_t f;
+  uint8_t aarx,aatx,l;
+  int prach_samples;
+  static int freq_channel_init=0;
+  static int n_samples_max=0;
+  __m256 chFx_256,chFy_256;
+  prach_samples = (prach_fmt<4)?13+839+12:3+139+2; 
+  // do some error checking
+  if (nb_rb-n_ra_prb<6) {
+    fprintf(stderr, "freq_channel_init: Impossible to allocate PRACH, check r_ra_prb value (r_ra_prb=%d)\n",n_ra_prb);
+    return(-1); 
+  }
+  if (freq_channel_init == 0) {
+    // we are initializing the lut for the largets possible n_samples=12*nb_rb+1
+    // if called with n_samples<12*nb_rb+1, we decimate the lut
+    n_samples_max=12*nb_rb+1;
+    if (init_freq_channel_prach_AVX_float(desc,nb_rb,n_samples_max,prach_fmt,n_ra_prb)==0)
+      freq_channel_init=1;
+    else
+      return(-1);
+  }
+  start_meas(&desc->interp_freq_PRACH);
+  for (f=0; f<(prach_samples>>3); f++) {
+    //clut = cos_lut[f];
+    //slut = sin_lut[f];
+    for (aarx=0; aarx<desc->nb_rx; aarx++) {
+      for (aatx=0; aatx<desc->nb_tx; aatx++) {
+        //desc->chF_prach[aarx+(aatx*desc->nb_rx)].x[f]=0.0;
+        //desc->chF_prach[aarx+(aatx*desc->nb_rx)].y[f]=0.0;
+	chFx_256=_mm256_setzero_ps();
+	chFy_256=_mm256_setzero_ps();
+        for (l=0; l<(int)desc->nb_taps; l++) {
+          //desc->chF_prach[aarx+(aatx*desc->nb_rx)].x[f]+=(desc->a[l][aarx+(aatx*desc->nb_rx)].x*cos_lut_f_prach[l][f]+
+          //    desc->a[l][aarx+(aatx*desc->nb_rx)].y*sin_lut_f_prach[l][f]);
+          //desc->chF_prach[aarx+(aatx*desc->nb_rx)].y[f]+=(-desc->a[l][aarx+(aatx*desc->nb_rx)].x*sin_lut_f_prach[l][f]+
+          //    desc->a[l][aarx+(aatx*desc->nb_rx)].y*cos_lut_f_prach[l][f]);
+	  chFx_256=_mm256_add_ps(chFx_256,_mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(desc->a[l][aarx+(aatx*desc->nb_rx)].x),_mm256_loadu_ps(&cos_lut_f_prach[l][8*f])),_mm256_mul_ps(_mm256_set1_ps(desc->a[l][aarx+(aatx*desc->nb_rx)].y),_mm256_loadu_ps(&sin_lut_f_prach[l][8*f]))));  
+	  chFy_256=_mm256_add_ps(chFy_256,_mm256_sub_ps(_mm256_mul_ps(_mm256_set1_ps(desc->a[l][aarx+(aatx*desc->nb_rx)].y),_mm256_loadu_ps(&cos_lut_f_prach[l][8*f])),_mm256_mul_ps(_mm256_set1_ps(desc->a[l][aarx+(aatx*desc->nb_rx)].x),_mm256_loadu_ps(&sin_lut_f_prach[l][8*f]))));  
+        }
+	_mm256_storeu_ps(&desc->chF_prach[aarx+(aatx*desc->nb_rx)].x[8*f],chFx_256);
+	_mm256_storeu_ps(&desc->chF_prach[aarx+(aatx*desc->nb_rx)].y[8*f],chFy_256);
+      }
+    }
+	//if (f<10 || (f>829&&f<839))
+	//	printf("chF_prach[0][%d], (x,y) = (%e,%e)\n",f,desc->chF_prach[0][f].x,desc->chF_prach[0][f].y);
+  }
+  stop_meas(&desc->interp_freq_PRACH);
+  return(0);
+}
 //#endif
 double compute_pbch_sinr(channel_desc_t *desc,
                         channel_desc_t *desc_i1,

--- a/openair1/SIMULATION/TOOLS/defs.h
+++ b/openair1/SIMULATION/TOOLS/defs.h
@@ -510,11 +510,13 @@ int freq_channel_SSE_float(channel_desc_t *desc,uint16_t nb_rb, int16_t n_sample
 int freq_channel_AVX_float(channel_desc_t *desc,uint16_t nb_rb, int16_t n_samples);
 int freq_channel_prach(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples,int16_t prach_fmt,int16_t n_ra_prb);
 int freq_channel_prach_SSE_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples,int16_t prach_fmt,int16_t n_ra_prb);
+int freq_channel_prach_AVX_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples,int16_t prach_fmt,int16_t n_ra_prb);
 int init_freq_channel(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples);
 int init_freq_channel_SSE_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples);
 int init_freq_channel_AVX_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples);
 int init_freq_channel_prach(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples,int16_t prach_fmt,int16_t n_ra_prb);
 int init_freq_channel_prach_SSE_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples,int16_t prach_fmt,int16_t n_ra_prb);
+int init_freq_channel_prach_AVX_float(channel_desc_t *desc,uint16_t nb_rb,int16_t n_samples,int16_t prach_fmt,int16_t n_ra_prb);
 void sincos_ps(__m128 x, __m128 *s, __m128 *c);
 void sincos256_ps(__m256 x, __m256 *s, __m256 *c);

--- a/openair1/SIMULATION/TOOLS/multipath_channel.c
+++ b/openair1/SIMULATION/TOOLS/multipath_channel.c
@@ -951,7 +951,7 @@ void multipath_channel_prach_AVX_float(channel_desc_t *desc,
 		// do nothing - keep channel
 		} else {
 		random_channel_freq(desc,0);
-		freq_channel_prach_SSE_float(desc,nb_rb,n_samples,prach_fmt,n_ra_prb);//Find desc->chF_prach
+		freq_channel_prach_AVX_float(desc,nb_rb,n_samples,prach_fmt,n_ra_prb);//Find desc->chF_prach
 		}	
 			for (f=0;f<(length>>3); f++) {
 				//rx_tmp.x = 0;

--- a/targets/SIMU/USER/channel_sim.c
+++ b/targets/SIMU/USER/channel_sim.c