fixing a bug in cmult_vv, removing unused code.

89849444 · Florian Kaltenberger · 78913e26 · 78913e26 · 89849444 · 78913e26
Commit 89849444 authored Feb 19, 2016 by Florian Kaltenberger
5 changed files
--- a/openair1/PHY/TOOLS/cmult_mm.c
+++ b/openair1/PHY/TOOLS/cmult_mm.c
-/*******************************************************************************
-    OpenAirInterface
-    Copyright(c) 1999 - 2014 Eurecom
-
-    OpenAirInterface is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-
-    OpenAirInterface is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with OpenAirInterface.The full GNU General Public License is
-   included in this distribution in the file called "COPYING". If not,
-   see <http://www.gnu.org/licenses/>.
-
-  Contact Information
-  OpenAirInterface Admin: openair_admin@eurecom.fr
-  OpenAirInterface Tech : openair_tech@eurecom.fr
-  OpenAirInterface Dev  : openair4g-devel@lists.eurecom.fr
-
-  Address      : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE
-
- *******************************************************************************/
-#include "defs.h"
-
-int mult_cpx_matrix_h(short *x1[2][2],
-                      short *x2[2][2],
-                      short *y[2][2],
-                      unsigned int N,
-                      unsigned short output_shift,
-                      short hermitian)
-{
-
-  if (hermitian) {
-    // this computes x1^H*x2 and stores it in y
-    mult_cpx_vector_h(x2[0][0],x1[0][0],y[0][0],N,output_shift,1);
-    mult_cpx_vector_h(x2[0][1],x1[0][1],y[0][0],N,output_shift,1);
-    mult_cpx_vector_h(x2[0][0],x1[1][0],y[1][0],N,output_shift,1);
-    mult_cpx_vector_h(x2[0][1],x1[1][1],y[1][0],N,output_shift,1);
-    mult_cpx_vector_h(x2[1][0],x1[0][0],y[0][1],N,output_shift,1);
-    mult_cpx_vector_h(x2[1][1],x1[0][1],y[0][1],N,output_shift,1);
-    mult_cpx_vector_h(x2[1][0],x1[1][0],y[1][1],N,output_shift,1);
-    mult_cpx_vector_h(x2[1][1],x1[1][1],y[1][1],N,output_shift,1);
-  } else {
-    // this computes x1*x2^H and stores it in y
-    mult_cpx_vector_h(x1[0][0],x2[0][0],y[0][0],N,output_shift,1);
-    mult_cpx_vector_h(x1[0][1],x2[0][1],y[0][0],N,output_shift,1);
-    mult_cpx_vector_h(x1[0][0],x2[1][0],y[0][1],N,output_shift,1);
-    mult_cpx_vector_h(x1[0][1],x2[1][1],y[0][1],N,output_shift,1);
-    mult_cpx_vector_h(x1[1][0],x2[0][0],y[1][0],N,output_shift,1);
-    mult_cpx_vector_h(x1[1][1],x2[0][1],y[1][0],N,output_shift,1);
-    mult_cpx_vector_h(x1[1][0],x2[1][0],y[1][1],N,output_shift,1);
-    mult_cpx_vector_h(x1[1][1],x2[1][1],y[1][1],N,output_shift,1);
-  }
-}
-
-int mult_cpx_matrix_vector(int *x1[2][2],
-                           int *x2[2],
-                           int *y[2],
-                           unsigned int N,
-                           unsigned short output_shift)
-{
-
-  Zero_Buffer(y[0],N*8);
-  Zero_Buffer(y[1],N*8);
-
-  // this computes x1*x2 and stores it in y (32 bit)
-  mult_cpx_vector_add32((short*)x2[0],(short*)x1[0][0],(short*)y[0],N);
-  mult_cpx_vector_add32((short*)x2[1],(short*)x1[0][1],(short*)y[0],N);
-  mult_cpx_vector_add32((short*)x2[0],(short*)x1[1][0],(short*)y[1],N);
-  mult_cpx_vector_add32((short*)x2[1],(short*)x1[1][1],(short*)y[1],N);
-
-  // shift and pack
-  shift_and_pack((short*)y[0],N,output_shift);
-  shift_and_pack((short*)y[1],N,output_shift);
-
-}
-
-#ifdef MAIN_MM
-#include <stdio.h>
-#include <stdlib.h>
-main ()
-{
-  short x1_00[256] __attribute__((aligned(16)));
-  short x1_10[256] __attribute__((aligned(16)));
-  short x1_01[256] __attribute__((aligned(16)));
-  short x1_11[256] __attribute__((aligned(16)));
-  short x2_0[256] __attribute__((aligned(16)));
-  short x2_1[256] __attribute__((aligned(16)));
-  short y_0[256] __attribute__((aligned(16)));
-  short y_1[256] __attribute__((aligned(16)));
-
-  int *x1[2][2];
-  int *x2[2];
-  int *y[2];
-  int i,m,n;
-
-  x1[0][0] = (int*)x1_00;
-  x1[0][1] = (int*)x1_01;
-  x1[1][0] = (int*)x1_10;
-  x1[1][1] = (int*)x1_11;
-  x2[0] = (int*)x2_0;
-  x2[1] = (int*)x2_1;
-  y[0] = (int*)y_0;
-  y[1] = (int*)y_1;
-
-  for(m=0; m<2; m++) {
-    for(n=0; n<2; n++) {
-      for(i=0; i<256; i+=4) {
-        ((short*)x1[m][n])[i] = ((short) rand())/4;
-        ((short*)x1[m][n])[i+1] = ((short) rand())/4;
-        ((short*)x1[m][n])[i+2] = -((short*)x1[m][n])[i+1];
-        ((short*)x1[m][n])[i+3] = ((short*)x1[m][n])[i];
-      }
-    }
-
-    for(i=0; i<256; i+=4) {
-      ((short*)x2[m])[i] = ((short) rand())/4;
-      ((short*)x2[m])[i+1] = ((short) rand())/4;
-      ((short*)x2[m])[i+2] = ((short*)x2[m])[i];
-      ((short*)x2[m])[i+3] = ((short*)x2[m])[i+1];
-    }
-
-    Zero_Buffer(y[m],512);
-  }
-
-  /*
-  input[0] = 100;
-  input[1] = 200;
-  input[2] = -200;
-  input[3] = 100;
-  input[4] = 1000;
-  input[5] = 2000;
-  input[6] = -2000;
-  input[7] = 1000;
-  input[8] = 100;
-  input[9] = 200;
-  input[10] = -200;
-  input[11] = 100;
-  input[12] = 1000;
-  input[13] = 2000;
-  input[14] = -2000;
-  input[15] = 1000;
-
-  input2[0] = 2;
-  input2[1] = 1;
-  input2[2] = 2;
-  input2[3] = 1;
-  input2[4] = 20;
-  input2[5] = 10;
-  input2[6] = 20;
-  input2[7] = 10;
-  input2[8] = 2;
-  input2[9] = 1;
-  input2[10] = 2;
-  input2[11] = 1;
-  input2[12] = 2000;
-  input2[13] = 1000;
-  input2[14] = 2000;
-  input2[15] = 1000;
-
-  x1[0][0] = (int*)input;
-  x1[0][1] = (int*)input;
-  x1[1][0] = (int*)input;
-  x1[1][1] = (int*)input;
-
-  x2[0] = (int*)input2;
-  x2[1] = (int*)input2;
-
-  y[0] = (int*)output;
-  y[1] = (int*)output2;
-  */
-
-  mult_cpx_matrix_vector(x1,x2,y,64,15);
-
-  //mult_cpx_vector_add32(x2[0],x1[0][0],y[0],64);
-
-  for (i=0; i<128; i+=2)
-    printf("i=%d, x1 = [%d+1i*%d %d+1i*%d; %d+1i*%d %d+1i*%d]; x2 = [%d+1i*%d; %d+1i*%d]; y = [%d+1i*%d; %d+1i*%d]; y_m= round(x1*x2./pow2(15)); y-y_m \n",
-           i,
-           ((short*)x1[0][0])[2*i],  ((short*)x1[0][0])[2*i+2],
-           ((short*)x1[0][1])[2*i],  ((short*)x1[0][1])[2*i+2],
-           ((short*)x1[1][0])[2*i],  ((short*)x1[1][0])[2*i+2],
-           ((short*)x1[1][1])[2*i],  ((short*)x1[1][1])[2*i+2],
-           ((short*)x2[0])[2*i],  ((short*)x2[0])[2*i+1],
-           ((short*)x2[1])[2*i],  ((short*)x2[1])[2*i+1],
-           ((short*)y[0])[2*i],  ((short*)y[0])[2*i+1],
-           ((short*)y[1])[2*i],  ((short*)y[1])[2*i+1]);
-
-  //((int*)y[0])[i],  ((int*)y[0])[i+1],
-  //((int*)y[1])[i],  ((int*)y[1])[i+1]);
-
-}
-
-#endif
--- a/openair1/PHY/TOOLS/cmult_vv.c
+++ b/openair1/PHY/TOOLS/cmult_vv.c
@@ -50,7 +50,7 @@ int mult_cpx_conj_vector(int16_t *x1,
                         uint32_t N,
                         int output_shift)
 {
-  // Multiply elementwise two complex vectors of N elements with repeated formatted output
+  // Multiply elementwise the complex conjugate of x1 with x2. 
  // x1       - input 1    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
  //            We assume x1 with a dinamic of 15 bit maximum
  //
@@ -90,7 +90,7 @@ int mult_cpx_conj_vector(int16_t *x1,
    tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
    tmp_im = _mm_sign_epi16(tmp_im,*(__m128i*)&conjug[0]);
-    tmp_im = _mm_madd_epi16(tmp_im,*x1_128);
+    tmp_im = _mm_madd_epi16(tmp_im,*x2_128);
    tmp_re = _mm_srai_epi32(tmp_re,output_shift);
    tmp_im = _mm_srai_epi32(tmp_im,output_shift);
    tmpy0  = _mm_unpacklo_epi32(tmp_re,tmp_im);
@@ -130,3 +130,4 @@ int mult_cpx_conj_vector(int16_t *x1,

  return(0);
 }
+
--- a/openair1/PHY/TOOLS/cmult_vvh.c
+++ b/openair1/PHY/TOOLS/cmult_vvh.c
-/*******************************************************************************
-    OpenAirInterface
-    Copyright(c) 1999 - 2014 Eurecom
-
-    OpenAirInterface is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-
-    OpenAirInterface is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with OpenAirInterface.The full GNU General Public License is
-   included in this distribution in the file called "COPYING". If not,
-   see <http://www.gnu.org/licenses/>.
-
-  Contact Information
-  OpenAirInterface Admin: openair_admin@eurecom.fr
-  OpenAirInterface Tech : openair_tech@eurecom.fr
-  OpenAirInterface Dev  : openair4g-devel@lists.eurecom.fr
-
-  Address      : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE
-
- *******************************************************************************/
-#include "defs.h"
-
-static  __m128i shift __attribute__ ((aligned(16)));
-
-
-int mult_cpx_vector_h(short *x1,
-                      short *x2,
-                      short *y,
-                      unsigned int N,
-                      unsigned short output_shift,
-                      short sign)
-{
-  // Multiply elementwise the complex vector x1 with the complex conjugate of the complex vecotr x2 of N elements and adds it to the vector y.
-  // x1       - input 1    in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-  //            We assume x1 with a dinamic of 15 bit maximum
-  //
-  // x2       - input 2    in the format  |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
-  //            We assume x2 with a dinamic of 14 bit maximum
-  //
-  // y        - output     in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-  //
-  // N        - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
-  //
-  // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0)
-  //            WARNING: log2_amp>0 can cause overflow!!
-  // sign     - +1..add, -1..substract
-
-  unsigned int i;                 // loop counter
-
-  register __m128i m0,m1,m2;
-
-  short *temps;
-  int *tempd;
-
-  __m128i *x1_128;
-  __m128i *x2_128;
-  __m128i *y_128;
-  __m128i mask;
-
-  __m128i temp;
-
-  shift = _mm_cvtsi32_si128(output_shift);
-  x1_128 = (__m128i *)&x1[0];
-  x2_128 = (__m128i *)&x2[0];
-  y_128 = (__m128i *)&y[0];
-
-  if (sign == -1)
-    mask = (__m128i) _mm_set_epi16 (-1,1,-1,-1,-1,1,-1,-1);
-  else
-    mask = (__m128i) _mm_set_epi16 (1,-1,1,1,1,-1,1,1);
-
-  // we compute 2*4 cpx multiply for each loop
-  for(i=0; i<(N>>3); i++) {
-
-    //    printf("i=%d\n",i);
-
-    // unroll 1
-    //    temps = (short *)x1_128;
-    //    printf("x1 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
-    m1 = x1_128[0];
-    m2 = x2_128[0];
-
-    //    temps = (short *)&x2_128[0];
-    //    printf("x2 : %x,%x,%x,%x,%x,%x,%x,%x\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
-
-    // bring x2 in conjugate form
-    // the first two instructions might be replaced with a single one in SSE3
-    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_mullo_epi16(m2, mask);
-
-    //    temp = m2;
-    //    temps = (short *)&temp;
-    //    printf("x2 conj : %x,%x,%x,%x,%x,%x,%x,%x\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
-
-    m0 = _mm_madd_epi16(m1,m2); //pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]
-
-    //    temp = m0;
-
-    //    tempd = &temp;
-    //    printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]);
-
-    m0 = _mm_sra_epi32(m0,shift);        // 1- shift right by shift in order to  compensate for the input amplitude
-
-    //    temp = m0;
-
-    //    tempd = (int *)&temp;
-    //  printf("m0 : %d,%d,%d,%d\n",tempd[0],tempd[1],tempd[2],tempd[3]);
-
-    m0 = _mm_packs_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]
-    m0 = _mm_unpacklo_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]
-
-    y_128[0] = _mm_add_epi16(m0,y_128[0]);
-
-    //    temps = (short *)&y_128[0];
-    //    printf("y0 : %d,%d,%d,%d,%d,%d,%d,%d\n",temps[0],temps[1],temps[2],temps[3],temps[4],temps[5],temps[6],temps[7]);
-
-    // unroll 2
-    m1 = x1_128[1];
-    m2 = x2_128[1];
-
-    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_mullo_epi16(m2, mask);
-
-    m0 = _mm_madd_epi16(m1,m2); //pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]
-
-    m0 = _mm_sra_epi32(m0,shift);        // 1- shift right by shift in order to  compensate for the input amplitude
-
-    m0 = _mm_packs_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]
-    m0 = _mm_unpacklo_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]
-
-    y_128[1] = _mm_add_epi16(m0,y_128[1]);
-
-    // unroll 3
-    m1 = x1_128[2];
-    m2 = x2_128[2];
-
-    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_mullo_epi16(m2, mask);
-
-    m0 = _mm_madd_epi16(m1,m2); //pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]
-
-    m0 = _mm_sra_epi32(m0,shift);        // 1- shift right by shift in order to  compensate for the input amplitude
-
-    m0 = _mm_packs_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]
-    m0 = _mm_unpacklo_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]
-
-    y_128[2] = _mm_add_epi16(m0,y_128[2]);
-
-
-    // unroll 4
-    m1 = x1_128[3];
-    m2 = x2_128[3];
-
-    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_mullo_epi16(m2, mask);
-
-    m0 = _mm_madd_epi16(m1,m2); //pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]
-
-    m0 = _mm_sra_epi32(m0,shift);        // 1- shift right by shift in order to  compensate for the input amplitude
-
-    m0 = _mm_packs_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]
-    m0 = _mm_unpacklo_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]
-
-    y_128[3] = _mm_add_epi16(m0,y_128[3]);
-
-    x1_128+=4;
-    x2_128+=4;
-    y_128 +=4;
-    //    printf("x1_128 = %p, x2_128 =%p,  y_128=%p\n",x1_128,x2_128,y_128);
-
-  }
-
-
-  _mm_empty();
-  _m_empty();
-
-  return(0);
-}
-
-
-int mult_cpx_vector_h_add32(short *x1,
-                            short *x2,
-                            short *y,
-                            unsigned int N,
-                            short sign)
-{
-  // Multiply elementwise the complex vector x1 with the complex conjugate of the complex vecotr x2 of N elements and adds it to the vector y.
-  // x1       - input 1    in 16bit format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-  //            We assume x1 with a dinamic of 15 bit maximum
-  //
-  // x2       - input 2    in 16bit format  |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
-  //            We assume x2 with a dinamic of 14 bit maximum
-  //
-  // y        - output     in 32bit format  |Re0  Im0|,......,|Re(N-1)  Im(N-1)|
-  //
-  // N        - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
-  //
-  // sign     - +1..add, -1..substract
-
-  unsigned int i;                 // loop counter
-
-  register __m128i m0,m1,m2;
-
-  short *temps;
-  int *tempd;
-
-  __m128i *x1_128;
-  __m128i *x2_128;
-  __m128i *y_128;
-  __m128i mask;
-
-  __m128i temp;
-
-  x1_128 = (__m128i *)&x1[0];
-  x2_128 = (__m128i *)&x2[0];
-  y_128 = (__m128i *)&y[0];
-
-  if (sign == -1)
-    mask = (__m128i) _mm_set_epi16 (-1,1,-1,-1,-1,1,-1,-1);
-  else
-    mask = (__m128i) _mm_set_epi16 (1,-1,1,1,1,-1,1,1);
-
-  // we compute 2*4 cpx multiply for each loop
-  for(i=0; i<(N>>3); i++) {
-
-    m1 = x1_128[0];
-    m2 = x2_128[0];
-
-    // bring x2 in conjugate form
-    // the first two instructions might be replaced with a single one in SSE3
-    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_mullo_epi16(m2, mask);
-
-    m0 = _mm_madd_epi16(m1,m2);         // 1- compute x1[0]*x2[0], result is 32bit
-
-    y_128[0] = _mm_add_epi32(m0,y_128[0]);
-
-    // unroll 2
-    m1 = x1_128[1];
-    m2 = x2_128[1];
-
-    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_mullo_epi16(m2, mask);
-
-    m0 = _mm_madd_epi16(m1,m2);
-
-    y_128[1] = _mm_add_epi32(m0,y_128[1]);
-
-    // unroll 3
-    m1 = x1_128[2];
-    m2 = x2_128[2];
-
-    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_mullo_epi16(m2, mask);
-
-    m0 = _mm_madd_epi16(m1,m2);
-
-    y_128[2] = _mm_add_epi32(m0,y_128[2]);
-
-
-    // unroll 4
-    m1 = x1_128[3];
-    m2 = x2_128[3];
-
-    m2 = _mm_shufflelo_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_shufflehi_epi16(m2,_MM_SHUFFLE(0,1,3,2));
-    m2 = _mm_mullo_epi16(m2, mask);
-
-    m0 = _mm_madd_epi16(m1,m2);
-
-    y_128[3] = _mm_add_epi32(m0,y_128[3]);
-
-    x1_128+=4;
-    x2_128+=4;
-    y_128 +=4;
-
-  }
-
-
-  _mm_empty();
-  _m_empty();
-
-  return(0);
-}
-
-#ifdef MAIN
-#define L 16
-
-main ()
-{
-
-  short input[256] __attribute__((aligned(16)));
-  short input2[256] __attribute__((aligned(16)));
-  short output[256] __attribute__((aligned(16)));
-
-  int i;
-
-  Zero_Buffer(output,256*2);
-
-  for (i=0; i<16; i+=2)
-    printf("output[%d] = %d + %d i\n",i,output[i],output[i+1]);
-
-  input[0] = 100;
-  input[1] = 200;
-  input[2] = 100;
-  input[3] = 200;
-  input[4] = 1234;
-  input[5] = -1234;
-  input[6] = 1234;
-  input[7] = -1234;
-  input[8] = 100;
-  input[9] = 200;
-  input[10] = 100;
-  input[11] = 200;
-  input[12] = 1000;
-  input[13] = 2000;
-  input[14] = 1000;
-  input[15] = 2000;
-
-  input2[0] = 1;
-  input2[1] = 2;
-  input2[2] = 1;
-  input2[3] = 2;
-  input2[4] = 10;
-  input2[5] = 20;
-  input2[6] = 10;
-  input2[7] = 20;
-  input2[8] = 1;
-  input2[9] = 2;
-  input2[10] = 1;
-  input2[11] = 2;
-  input2[12] = 1000;
-  input2[13] = 2000;
-  input2[14] = 1000;
-  input2[15] = 2000;
-
-
-  mult_cpx_vector_h(input2,input2,output,8,0,1);
-
-  for (i=0; i<16; i+=2)
-    printf("output[%d] = %d + %d i\n",i,output[i],output[i+1]);
-
-  Zero_Buffer(output,256*2);
-  mult_cpx_vector_h(input2,input2,output,8,0,-1);
-
-  for (i=0; i<16; i+=2)
-    printf("output[%d] = %d + %d i\n",i,output[i],output[i+1]);
-
-}
-
-#endif //MAIN
--- a/openair1/PHY/TOOLS/defs.h
+++ b/openair1/PHY/TOOLS/defs.h
@@ -64,6 +64,8 @@ struct complex32 {
  int32_t i;
 };

+//cmult_sv.h
+
 /*!\fn void multadd_real_vector_complex_scalar(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N)
 This function performs componentwise multiplication and accumulation of a complex scalar and a real vector.
 @param x Vector input (Q1.15)
@@ -95,192 +97,14 @@ void multadd_complex_vector_real_scalar(int16_t *x,
                                        uint8_t zero_flag,
                                        uint32_t N);

-
-/*!\fn int32_t mult_cpx_vector(int16_t *x1,int16_t *x2,int16_t *y,uint32_t N,int32_t output_shift)
-This function performs optimized componentwise multiplication of two Q1.15 vectors in repeated format.
-
-@param x1 Input 1 in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-@param x2 Input 2 in the format  |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)|
-@param y  Output in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-@param N  Length of Vector WARNING: N must be a multiple of 8 (4x loop unrolling and two complex multiplies per cycle)
-@param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!!
-
-The function implemented is : \f$\mathbf{y} = \mathbf{x_1}\odot\mathbf{x_2}\f$
-*/
-int32_t mult_cpx_vector(int16_t *x1,
-                        int16_t *x2,
-                        int16_t *y,
-                        uint32_t N,
-                        int32_t output_shift);
-
-/*!\fn int32_t mult_cpx_vector_norep(int16_t *x1,int16_t *x2,int16_t *y,uint32_t N,int32_t output_shift)
-This function performs optimized componentwise multiplication of two Q1.15 vectors with normal formatted output.
-
-@param x1 Input 1 in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-@param x2 Input 2 in the format  |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)|
-@param y  Output in the format  |Re0  Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
-@param N  Length of Vector WARNING: N must be a multiple of 8 (4x loop unrolling and two complex multiplies per cycle)
-@param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!!
-
-The function implemented is : \f$\mathbf{y} = \mathbf{x_1}\odot\mathbf{x_2}\f$
-*/
-int32_t mult_cpx_vector_norep(int16_t *x1,
-                              int16_t *x2,
-                              int16_t *y,
-                              uint32_t N,
-                              int32_t output_shift);
-
-
-/*!\fn int32_t mult_cpx_vector_norep2(int16_t *x1,int16_t *x2,int16_t *y,uint32_t N,int32_t output_shift)
-This function performs optimized componentwise multiplication of two Q1.15 vectors with normal formatted output.
-
-@param x1 Input 1 in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-@param x2 Input 2 in the format  |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)|
-@param y  Output in the format  |Re0  Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
-@param N  Length of Vector WARNING: N must be a multiple of 8 (no unrolling and two complex multiplies per cycle)
-@param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!!
-
-The function implemented is : \f$\mathbf{y} = \mathbf{x_1}\odot\mathbf{x_2}\f$
-*/
-int32_t mult_cpx_vector_norep2(int16_t *x1,
-                               int16_t *x2,
-                               int16_t *y,
-                               uint32_t N,
-                               int32_t output_shift);
-
-int32_t mult_cpx_vector_norep_conj(int16_t *x1,
-                                   int16_t *x2,
-                                   int16_t *y,
-                                   uint32_t N,
-                                   int32_t output_shift);
-
-int32_t mult_cpx_vector_norep_conj2(int16_t *x1,
-                                    int16_t *x2,
-                                    int16_t *y,
-                                    uint32_t N,
-                                    int32_t output_shift);
-
-/*!\fn int32_t mult_cpx_vector2(int16_t *x1,int16_t *x2,int16_t *y,uint32_t N,int32_t output_shift)
-This function performs optimized componentwise multiplication of two Q1.15 vectors.
-
-@param x1 Input 1 in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-@param x2 Input 2 in the format  |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)|
-@param y  Output in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-@param N  Length of Vector WARNING: N must be a multiple of 2 (2 complex multiplies per cycle)
-@param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!!
-
-The function implemented is : \f$\mathbf{y} = \mathbf{x_1}\odot\mathbf{x_2}\f$
-*/
-int32_t mult_cpx_vector2(int16_t *x1,
-                         int16_t *x2,
-                         int16_t *y,
-                         uint32_t N,
-                         int32_t output_shift);
-
-/*!\fn int32_t mult_cpx_vector_add(int16_t *x1,int16_t *x2,int16_t *y,uint32_t N,int32_t output_shift)
-This function performs optimized componentwise multiplication of two Q1.15 vectors. The output IS ADDED TO y. WARNING: make sure that output_shift is set to the right value, so that the result of the multiplication has the comma at the same place as y.
-
-@param x1 Input 1 in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-@param x2 Input 2 in the format  |Re0 -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)|
-@param y  Output in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-@param N  Length of Vector WARNING: N>=4
-@param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!!
-
-The function implemented is : \f$\mathbf{y} += \mathbf{x_1}\odot\mathbf{x_2}\f$
-*/
-
-int32_t mult_cpx_vector_add(int16_t *x1,
-                            int16_t *x2,
-                            int16_t *y,
-                            uint32_t N,
-                            int32_t output_shift);
-
-
-int32_t mult_cpx_vector_h_add32(int16_t *x1,
-                                int16_t *x2,
-                                int16_t *y,
-                                uint32_t N,
-                                int16_t sign);
-
-int32_t mult_cpx_vector_add32(int16_t *x1,
-                              int16_t *x2,
-                              int16_t *y,
-                              uint32_t N);
-
-int32_t mult_vector32(int16_t *x1,
-                      int16_t *x2,
+int rotate_cpx_vector(int16_t *x,
+                      int16_t *alpha,
                      int16_t *y,
-                      uint32_t N);
-
-int32_t mult_vector32_scalar(int16_t *x1,
-                             int32_t x2,
-                             int16_t *y,
-                             uint32_t N);
+                      uint32_t N,
+                      uint16_t output_shift);

-int32_t mult_cpx_vector32_conj(int16_t *x,
-                               int16_t *y,
-                               uint32_t N);

-int32_t mult_cpx_vector32_real(int16_t *x1,
-                               int16_t *x2,
-                               int16_t *y,
-                               uint32_t N);

-int32_t shift_and_pack(int16_t *y,
-                       uint32_t N,
-                       int32_t output_shift);
-
-/*!\fn int32_t mult_cpx_vector_h(int16_t *x1,int16_t *x2,int16_t *y,uint32_t N,int32_t output_shift,int16_t sign)
-This function performs optimized componentwise multiplication of the vector x1 with the conjugate of the vector x2. The output IS ADDED TO y. WARNING: make sure that output_shift is set to the right value, so that the result of the multiplication has the comma at the same place as y.
-
-@param x1 Input 1 in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-@param x2 Input 2 in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
-@param y  Output in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-@param N  Length of Vector (complex samples) WARNING: N>=4
-@param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!!
-@param sign +1..add, -1..substract
-
-The function implemented is : \f$\mathbf{y} = \mathbf{y} + \mathbf{x_1}\odot\mathbf{x_2}^*\f$
-*/
-int32_t mult_cpx_vector_h(int16_t *x1,
-                          int16_t *x2,
-                          int16_t *y,
-                          uint32_t N,
-                          int32_t output_shift,
-                          int16_t sign);
-
-/*!\fn int32_t mult_cpx_matrix_h(int16_t *x1[2][2],int16_t *x2[2][2],int16_t *y[2][2],uint32_t N,uint16_t output_shift,int16_t hermitian)
-This function performs optimized componentwise matrix multiplication of the 2x2 matrices x1 with the 2x2 matrices x2. The output IS ADDED TO y (i.e. make sure y is initilized correctly). WARNING: make sure that output_shift is set to the right value, so that the result of the multiplication has the comma at the same place as y.
-
-@param x1 Input 1 in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-@param x2 Input 2 in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
-@param y  Output in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-@param N  Length of Vector (complex samples) WARNING: N>=4
-@param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!!
-@param hermitian if !=0 the hermitian transpose is returned (i.e. A^H*B instead of A*B^H)
-*/
-int32_t mult_cpx_matrix_h(int16_t *x1[2][2],
-                          int16_t *x2[2][2],
-                          int16_t *y[2][2],
-                          uint32_t N,
-                          uint16_t output_shift,
-                          int16_t hermitian);
-
-
-/*!\fn int32_t mult_cpx_matrix_vector(int32_t *x1[2][2],int32_t *x2[2],int32_t *y[2],uint32_t N,uint16_t output_shift)
-This function performs optimized componentwise matrix-vector multiplication of the 2x2 matrices x1 with the 2x1 vectors x2. The output IS ADDED TO y (i.e. make sure y is initilized correctly). WARNING: make sure that output_shift is set to the right value, so that the result of the multiplication has the comma at the same place as y.
-
-@param x1 Input 1 in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-@param x2 Input 2 in the format  |Re0  -Im0 Im0 Re0|,......,|Re(N-1) -Im(N-1) Im(N-1) Re(N-1)|
-@param y  Output in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)| WARNING: y must be different for x2
-@param N  Length of Vector (complex samples) WARNING: N>=4
-@param output_shift Number of bits to shift output down to Q1.15 (should be 15 for Q1.15 inputs) WARNING: log2_amp>0 can cause overflow!!
-*/
-int32_t mult_cpx_matrix_vector(int32_t *x1[2][2],
-                               int32_t *x2[2],
-                               int32_t *y[2],
-                               uint32_t N,
-                               uint16_t output_shift);

 /*!\fn void init_fft(uint16_t size,uint8_t logsize,uint16_t *rev)
 \brief Initialize the FFT engine for a given size
@@ -289,6 +113,25 @@ int32_t mult_cpx_matrix_vector(int32_t *x1[2][2],
 @param rev Pointer to bit-reversal permutation array
 */

+//cmult_vv.c
+/*!
+  Multiply elementwise the complex conjugate of x1 with x2. 
+  @param x1       - input 1    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
+              We assume x1 with a dinamic of 15 bit maximum
+  @param x2       - input 2    in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
+              We assume x2 with a dinamic of 14 bit maximum
+  @param y        - output     in the format  |Re0 Im0 Re1 Im1|,......,|Re(N-2)  Im(N-2) Re(N-1) Im(N-1)|
+  @param N        - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
+  @param output_shift  - shift to be applied to generate output
+*/
+
+int mult_cpx_conj_vector(int16_t *x1,
+                         int16_t *x2,
+                         int16_t *y,
+                         uint32_t N,
+                         int output_shift);
+
+// lte_dfts.c
 void init_fft(uint16_t size,
              uint8_t logsize,
              uint16_t *rev);
@@ -355,6 +198,7 @@ int32_t rotate_cpx_vector(int16_t *x,
                          uint16_t output_shift);


+//cadd_sv.c

 /*!\fn int32_t add_cpx_vector(int16_t *x,int16_t *alpha,int16_t *y,uint32_t N)
 This function performs componentwise addition of a vector with a complex scalar.

--- a/openair1/PHY/TOOLS/time_meas.h
+++ b/openair1/PHY/TOOLS/time_meas.h
@@ -26,6 +26,9 @@
  Address      : Eurecom, Campus SophiaTech, 450 Route des Chappes, CS 50193 - 06904 Biot Sophia Antipolis cedex, FRANCE

 *******************************************************************************/
+#ifndef __TIME_MEAS_DEFS__H__
+#define __TIME_MEAS_DEFS__H__
+
 #include <unistd.h>
 #include <math.h>
 #include <stdint.h>
@@ -148,3 +151,4 @@ static inline void copy_meas(time_stats_t *dst_ts,time_stats_t *src_ts)
    dst_ts->max=src_ts->max;
  }
 }
+#endif