avx2 and better simd algo

bf219fc2 · Laurent Thomas · laurent · 6d030ea2 · bf219fc2 · bf219fc2
Commit bf219fc2 authored Jun 16, 2022 by Laurent Thomas Committed by laurent Jun 27, 2022
Hide whitespace changes
Inline Side-by-side

Showing with 31 additions and 205 deletions

openair1/PHY/TOOLS/cmult_sv.c openair1/PHY/TOOLS/cmult_sv.c +29 -195

openair1/PHY/TOOLS/tools_defs.h openair1/PHY/TOOLS/tools_defs.h +2 -10

No files found.
--- a/openair1/PHY/TOOLS/cmult_sv.c
+++ b/openair1/PHY/TOOLS/cmult_sv.c
@@ -144,205 +144,39 @@ void multadd_real_four_symbols_vector_complex_scalar(int16_t *x,
  _m_empty();
 }
+#ifdef __AVX2__
-/*
+void rotate_cpx_vector(int16_t *x,
-int rotate_cpx_vector(int16_t *x,
                      int16_t *alpha,
                      int16_t *y,
                      uint32_t N,
-                      uint16_t output_shift,
+                      uint16_t output_shift)
-                      uint8_t format)
-{
-  // Multiply elementwise two complex vectors of N elements
-  // x        - input 1    in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-  //            We assume x1 with a dynamic of 15 bit maximum
-  //
-  // alpha      - input 2    in the format  |Re0 Im0|
-  //            We assume x2 with a dynamic of 15 bit maximum
-  //
-  // y        - output     in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
-  //
-  // N        - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
-  //
-  // output_shift - shift at output to return in Q1.15
-  // format - 0 means alpha is in shuffled format, 1 means x is in shuffled format
-  uint32_t i;                 // loop counter
-  register __m128i m0,m1;
-  __m128i *x_128;
-  __m128i *y_128;
-  shift = _mm_cvtsi32_si128(output_shift);
-  x_128 = (__m128i *)&x[0];
-  if (format==0) {  // alpha is in shuffled format for complex multiply
-    ((int16_t *)&alpha_128)[0] = alpha[0];
-    ((int16_t *)&alpha_128)[1] = -alpha[1];
-    ((int16_t *)&alpha_128)[2] = alpha[1];
-    ((int16_t *)&alpha_128)[3] = alpha[0];
-    ((int16_t *)&alpha_128)[4] = alpha[0];
-    ((int16_t *)&alpha_128)[5] = -alpha[1];
-    ((int16_t *)&alpha_128)[6] = alpha[1];
-    ((int16_t *)&alpha_128)[7] = alpha[0];
-  } else { // input is in shuffled format for complex multiply
-    ((int16_t *)&alpha_128)[0] = alpha[0];
-    ((int16_t *)&alpha_128)[1] = alpha[1];
-    ((int16_t *)&alpha_128)[2] = alpha[0];
-    ((int16_t *)&alpha_128)[3] = alpha[1];
-    ((int16_t *)&alpha_128)[4] = alpha[0];
-    ((int16_t *)&alpha_128)[5] = alpha[1];
-    ((int16_t *)&alpha_128)[6] = alpha[0];
-    ((int16_t *)&alpha_128)[7] = alpha[1];
-  }
-  y_128 = (__m128i *)&y[0];
-  //  _mm_empty();
-  //  return(0);
-  // we compute 4 cpx multiply for each loop
-  for(i=0; i<(N>>3); i++) {
-    m0 = _mm_madd_epi16(x_128[0],alpha_128); //pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]
-    m0 = _mm_sra_epi32(m0,shift);        // 1- shift right by shift in order to  compensate for the input amplitude
-    m1=m0;
-    m0 = _mm_packs_epi32(m1,m0);        // 1- pack in a 128 bit register [re im re im]
-    y_128[0] = _mm_unpacklo_epi32(m0,m0);        // 1- pack in a 128 bit register [re im re im]
-    m0 = _mm_madd_epi16(x_128[1],alpha_128); //pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]
-    m0 = _mm_sra_epi32(m0,shift);        // 1- shift right by shift in order to  compensate for the input amplitude
-    m1 = m0;
-    m1 = _mm_packs_epi32(m1,m0);        // 1- pack in a 128 bit register [re im re im]
-    y_128[1] = _mm_unpacklo_epi32(m1,m1);        // 1- pack in a 128 bit register [re im re im]
-    m0 = _mm_madd_epi16(x_128[2],alpha_128); //pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]
-    m0 = _mm_sra_epi32(m0,shift);        // 1- shift right by shift in order to  compensate for the input amplitude
-    m1 = m0;
-    m1 = _mm_packs_epi32(m1,m0);        // 1- pack in a 128 bit register [re im re im]
-    y_128[2] = _mm_unpacklo_epi32(m1,m1);        // 1- pack in a 128 bit register [re im re im]
-    m0 = _mm_madd_epi16(x_128[3],alpha_128); //pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]
-    m0 = _mm_sra_epi32(m0,shift);        // 1- shift right by shift in order to  compensate for the input amplitude
-    m1 = m0;
-    m1 = _mm_packs_epi32(m1,m0);        // 1- pack in a 128 bit register [re im re im]
-    y_128[3] = _mm_unpacklo_epi32(m1,m1);        // 1- pack in a 128 bit register [re im re im]
-    if (format==1) {  // Put output in proper format (Re,-Im,Im,Re), shuffle = (0,1,3,2) = 0x1e
-      y_128[0] = _mm_shufflelo_epi16(y_128[0],0x1e);
-      y_128[0] = _mm_shufflehi_epi16(y_128[0],0x1e);
-      ((int16_t*)&y_128[0])[1] = -((int16_t*)&y_128[0])[1];
-      ((int16_t*)&y_128[0])[5] = -((int16_t*)&y_128[0])[5];
-      y_128[1] = _mm_shufflelo_epi16(y_128[1],0x1e);
-      y_128[1] = _mm_shufflehi_epi16(y_128[1],0x1e);
-      ((int16_t*)&y_128[1])[1] = -((int16_t*)&y_128[1])[1];
-      ((int16_t*)&y_128[1])[5] = -((int16_t*)&y_128[1])[5];
-      y_128[2] = _mm_shufflelo_epi16(y_128[2],0x1e);
-      y_128[2] = _mm_shufflehi_epi16(y_128[2],0x1e);
-      ((int16_t*)&y_128[2])[1] = -((int16_t*)&y_128[2])[1];
-      ((int16_t*)&y_128[2])[5] = -((int16_t*)&y_128[2])[5];
-      y_128[3] = _mm_shufflelo_epi16(y_128[3],0x1e);
-      y_128[3] = _mm_shufflehi_epi16(y_128[3],0x1e);
-      ((int16_t*)&y_128[3])[1] = -((int16_t*)&y_128[3])[1];
-      ((int16_t*)&y_128[3])[5] = -((int16_t*)&y_128[3])[5];
-    }
-    x_128+=4;
-    y_128 +=4;
-  }
-  _mm_empty();
-  _m_empty();
-  return(0);
-}
-int rotate_cpx_vector2(int16_t *x,
-                       int16_t *alpha,
-                       int16_t *y,
-                       uint32_t N,
-                       uint16_t output_shift,
-                       uint8_t format)
 {
-  // Multiply elementwise two complex vectors of N elements
+  // multiply a complex vector with a complex value (alpha)
-  // x        - input 1    in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
+  // stores result in y
-  //            We assume x1 with a dynamic of 15 bit maximum
+  // N is the number of complex numbers
-  //
+  // output_shift reduces the result of the multiplication by this number of bits
-  // alpha      - input 2    in the format  |Re0 Im0|
+  AssertFatal(N%8==0, "To be developped");
-  //            We assume x2 with a dynamic of 15 bit maximum
+  const c16_t for_re={alpha[0], -alpha[1]};
-  //
+  __m256i const alpha_for_real =  _mm256_set1_epi32(*(uint32_t*)&for_re);
-  // y        - output     in the format  |Re0  Im0 Re0 Im0|,......,|Re(N-1)  Im(N-1) Re(N-1) Im(N-1)|
+  const c16_t for_im={alpha[1], alpha[0]};
-  //
+  __m256i const alpha_for_im= _mm256_set1_epi32(*(uint32_t*)&for_im);
-  // N        - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
+  __m256i const perm_mask =
-  //
+    _mm256_set_epi8(31,30,23,22,29,28,21,20,27,26,19,18,25,24,17,16,
-  // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0)
+		    15,14,7,6,13,12,5,4,11,10,3,2,9,8,1,0);
-  //            WARNING: log2_amp>0 can cause overflow!!
+ __m256i* xd= (__m256i*)x;
+  const __m256i *end=xd+N/8;
-  uint32_t i;                 // loop counter
+  for( __m256i* yd = (__m256i *)y; xd<end ; yd++, xd++) {
+    const __m256i xre = _mm256_srai_epi32(_mm256_madd_epi16(*xd,alpha_for_real),
-  register __m128i m0,m1;
+					  output_shift);
+    const __m256i xim = _mm256_srai_epi32(_mm256_madd_epi16(*xd,alpha_for_im),
+					  output_shift);
-  __m128i *x_128;
+    // a bit faster than unpacklo+unpackhi+packs
-  __m128i *y_128;
+    const __m256i tmp=_mm256_packs_epi32(xre,xim);
+    *yd=_mm256_shuffle_epi8(tmp,perm_mask);
-  shift = _mm_cvtsi32_si128(output_shift);
-  x_128 = (__m128i *)&x[0];
-  if (format==0) {  // alpha is in shuffled format for complex multiply
-    ((int16_t *)&alpha_128)[0] = alpha[0];
-    ((int16_t *)&alpha_128)[1] = -alpha[1];
-    ((int16_t *)&alpha_128)[2] = alpha[1];
-    ((int16_t *)&alpha_128)[3] = alpha[0];
-    ((int16_t *)&alpha_128)[4] = alpha[0];
-    ((int16_t *)&alpha_128)[5] = -alpha[1];
-    ((int16_t *)&alpha_128)[6] = alpha[1];
-    ((int16_t *)&alpha_128)[7] = alpha[0];
-  } else { // input is in shuffled format for complex multiply
-    ((int16_t *)&alpha_128)[0] = alpha[0];
-    ((int16_t *)&alpha_128)[1] = alpha[1];
-    ((int16_t *)&alpha_128)[2] = alpha[0];
-    ((int16_t *)&alpha_128)[3] = alpha[1];
-    ((int16_t *)&alpha_128)[4] = alpha[0];
-    ((int16_t *)&alpha_128)[5] = alpha[1];
-    ((int16_t *)&alpha_128)[6] = alpha[0];
-    ((int16_t *)&alpha_128)[7] = alpha[1];
  }
-  y_128 = (__m128i *)&y[0];
-  // we compute 4 cpx multiply for each loop
-  for(i=0; i<(N>>1); i++) {
-    m0 = _mm_madd_epi16(x_128[i],alpha_128); //pmaddwd_r2r(mm1,mm0);         // 1- compute x1[0]*x2[0]
-    m0 = _mm_sra_epi32(m0,shift);        // 1- shift right by shift in order to  compensate for the input amplitude
-    m1=m0;
-    m1 = _mm_packs_epi32(m1,m0);        // 1- pack in a 128 bit register [re im re im]
-    y_128[i] = _mm_unpacklo_epi32(m1,m1);        // 1- pack in a 128 bit register [re im re im]
-    if (format==1) {  // Put output in proper format (Re,-Im,Im,Re), shuffle = (0,1,3,2) = 0x1e
-      y_128[i] = _mm_shufflelo_epi16(y_128[i],0x1e);
-      y_128[i] = _mm_shufflehi_epi16(y_128[i],0x1e);
-      ((int16_t*)&y_128[i])[1] = -((int16_t*)&y_128[i])[1];
-      ((int16_t*)&y_128[i])[5] = -((int16_t*)&y_128[i])[5];
-    }
-  }
-  _mm_empty();
-  _m_empty();
-  return(0);
 }
-*/
+#else 
+void rotate_cpx_vector(int16_t *x,
-int rotate_cpx_vector(int16_t *x,
                      int16_t *alpha,
                      int16_t *y,
                      uint32_t N,
@@ -439,9 +273,9 @@ int rotate_cpx_vector(int16_t *x,
  _mm_empty();
  _m_empty();
-  return(0);
+  return;
 }
+#endif
 /*
 int mult_vector32_scalar(int16_t *x1,
                         int x2,

--- a/openair1/PHY/TOOLS/tools_defs.h
+++ b/openair1/PHY/TOOLS/tools_defs.h
@@ -37,6 +37,7 @@ extern "C" {
 #include <stdint.h>
 #include <assert.h>
 #include "PHY/sse_intrin.h"
+#include "common/utils/assertions.h"
 #define CEILIDIV(a,b) ((a+b-1)/b)
 #define ROUNDIDIV(a,b) (((a<<1)+b)/(b<<1))
@@ -104,15 +105,6 @@ void multadd_complex_vector_real_scalar(int16_t *x,
                                        uint8_t zero_flag,
                                        uint32_t N);
-int rotate_cpx_vector(int16_t *x,
-                      int16_t *alpha,
-                      int16_t *y,
-                      uint32_t N,
-                      uint16_t output_shift);
 /*!\fn void init_fft(uint16_t size,uint8_t logsize,uint16_t *rev)
 \brief Initialize the FFT engine for a given size
 @param size Size of the FFT
@@ -471,7 +463,7 @@ This function performs componentwise multiplication of a vector with a complex s
 The function implemented is : \f$\mathbf{y} = \alpha\mathbf{x}\f$
 */
-int32_t rotate_cpx_vector(int16_t *x,
+void rotate_cpx_vector(int16_t *x,
                          int16_t *alpha,
                          int16_t *y,
                          uint32_t N,