Commit bf219fc2 authored by Laurent Thomas, committed by laurent

avx2 and better simd algo

parent 6d030ea2
@@ -144,205 +144,39 @@ void multadd_real_four_symbols_vector_complex_scalar(int16_t *x,
   _m_empty();
 }
-/*
-int rotate_cpx_vector(int16_t *x,
-                      int16_t *alpha,
-                      int16_t *y,
-                      uint32_t N,
-                      uint16_t output_shift,
-                      uint8_t format)
-{
-  // Multiply elementwise two complex vectors of N elements
-  // x - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
-  //     We assume x1 with a dynamic of 15 bit maximum
-  //
-  // alpha - input 2 in the format |Re0 Im0|
-  //     We assume x2 with a dynamic of 15 bit maximum
-  //
-  // y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
-  //
-  // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
-  //
-  // output_shift - shift at output to return in Q1.15
-  // format - 0 means alpha is in shuffled format, 1 means x is in shuffled format
-  uint32_t i;                 // loop counter
-  register __m128i m0,m1;
-  __m128i *x_128;
-  __m128i *y_128;
-  shift = _mm_cvtsi32_si128(output_shift);
-  x_128 = (__m128i *)&x[0];
-  if (format==0) {  // alpha is in shuffled format for complex multiply
-    ((int16_t *)&alpha_128)[0] = alpha[0];
-    ((int16_t *)&alpha_128)[1] = -alpha[1];
-    ((int16_t *)&alpha_128)[2] = alpha[1];
-    ((int16_t *)&alpha_128)[3] = alpha[0];
-    ((int16_t *)&alpha_128)[4] = alpha[0];
-    ((int16_t *)&alpha_128)[5] = -alpha[1];
-    ((int16_t *)&alpha_128)[6] = alpha[1];
-    ((int16_t *)&alpha_128)[7] = alpha[0];
-  } else {  // input is in shuffled format for complex multiply
-    ((int16_t *)&alpha_128)[0] = alpha[0];
-    ((int16_t *)&alpha_128)[1] = alpha[1];
-    ((int16_t *)&alpha_128)[2] = alpha[0];
-    ((int16_t *)&alpha_128)[3] = alpha[1];
-    ((int16_t *)&alpha_128)[4] = alpha[0];
-    ((int16_t *)&alpha_128)[5] = alpha[1];
-    ((int16_t *)&alpha_128)[6] = alpha[0];
-    ((int16_t *)&alpha_128)[7] = alpha[1];
-  }
-  y_128 = (__m128i *)&y[0];
-  //  _mm_empty();
-  //  return(0);
-  // we compute 4 cpx multiply for each loop
-  for(i=0; i<(N>>3); i++) {
-    m0 = _mm_madd_epi16(x_128[0],alpha_128); //pmaddwd_r2r(mm1,mm0);  // 1- compute x1[0]*x2[0]
-    m0 = _mm_sra_epi32(m0,shift);  // 1- shift right by shift in order to compensate for the input amplitude
-    m1=m0;
-    m0 = _mm_packs_epi32(m1,m0);  // 1- pack in a 128 bit register [re im re im]
-    y_128[0] = _mm_unpacklo_epi32(m0,m0);  // 1- pack in a 128 bit register [re im re im]
-    m0 = _mm_madd_epi16(x_128[1],alpha_128); //pmaddwd_r2r(mm1,mm0);  // 1- compute x1[0]*x2[0]
-    m0 = _mm_sra_epi32(m0,shift);  // 1- shift right by shift in order to compensate for the input amplitude
-    m1 = m0;
-    m1 = _mm_packs_epi32(m1,m0);  // 1- pack in a 128 bit register [re im re im]
-    y_128[1] = _mm_unpacklo_epi32(m1,m1);  // 1- pack in a 128 bit register [re im re im]
-    m0 = _mm_madd_epi16(x_128[2],alpha_128); //pmaddwd_r2r(mm1,mm0);  // 1- compute x1[0]*x2[0]
-    m0 = _mm_sra_epi32(m0,shift);  // 1- shift right by shift in order to compensate for the input amplitude
-    m1 = m0;
-    m1 = _mm_packs_epi32(m1,m0);  // 1- pack in a 128 bit register [re im re im]
-    y_128[2] = _mm_unpacklo_epi32(m1,m1);  // 1- pack in a 128 bit register [re im re im]
-    m0 = _mm_madd_epi16(x_128[3],alpha_128); //pmaddwd_r2r(mm1,mm0);  // 1- compute x1[0]*x2[0]
-    m0 = _mm_sra_epi32(m0,shift);  // 1- shift right by shift in order to compensate for the input amplitude
-    m1 = m0;
-    m1 = _mm_packs_epi32(m1,m0);  // 1- pack in a 128 bit register [re im re im]
-    y_128[3] = _mm_unpacklo_epi32(m1,m1);  // 1- pack in a 128 bit register [re im re im]
-    if (format==1) {  // Put output in proper format (Re,-Im,Im,Re), shuffle = (0,1,3,2) = 0x1e
-      y_128[0] = _mm_shufflelo_epi16(y_128[0],0x1e);
-      y_128[0] = _mm_shufflehi_epi16(y_128[0],0x1e);
-      ((int16_t*)&y_128[0])[1] = -((int16_t*)&y_128[0])[1];
-      ((int16_t*)&y_128[0])[5] = -((int16_t*)&y_128[0])[5];
-      y_128[1] = _mm_shufflelo_epi16(y_128[1],0x1e);
-      y_128[1] = _mm_shufflehi_epi16(y_128[1],0x1e);
-      ((int16_t*)&y_128[1])[1] = -((int16_t*)&y_128[1])[1];
-      ((int16_t*)&y_128[1])[5] = -((int16_t*)&y_128[1])[5];
-      y_128[2] = _mm_shufflelo_epi16(y_128[2],0x1e);
-      y_128[2] = _mm_shufflehi_epi16(y_128[2],0x1e);
-      ((int16_t*)&y_128[2])[1] = -((int16_t*)&y_128[2])[1];
-      ((int16_t*)&y_128[2])[5] = -((int16_t*)&y_128[2])[5];
-      y_128[3] = _mm_shufflelo_epi16(y_128[3],0x1e);
-      y_128[3] = _mm_shufflehi_epi16(y_128[3],0x1e);
-      ((int16_t*)&y_128[3])[1] = -((int16_t*)&y_128[3])[1];
-      ((int16_t*)&y_128[3])[5] = -((int16_t*)&y_128[3])[5];
-    }
-    x_128+=4;
-    y_128 +=4;
-  }
-  _mm_empty();
-  _m_empty();
-  return(0);
-}
-int rotate_cpx_vector2(int16_t *x,
+#ifdef __AVX2__
+void rotate_cpx_vector(int16_t *x,
                        int16_t *alpha,
                        int16_t *y,
                        uint32_t N,
-                       uint16_t output_shift,
-                       uint8_t format)
+                       uint16_t output_shift)
 {
-  // Multiply elementwise two complex vectors of N elements
-  // x - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
-  //     We assume x1 with a dynamic of 15 bit maximum
-  //
-  // alpha - input 2 in the format |Re0 Im0|
-  //     We assume x2 with a dynamic of 15 bit maximum
-  //
-  // y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
-  //
-  // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
-  //
-  // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0)
-  //            WARNING: log2_amp>0 can cause overflow!!
-  uint32_t i;                 // loop counter
-  register __m128i m0,m1;
-  __m128i *x_128;
-  __m128i *y_128;
-  shift = _mm_cvtsi32_si128(output_shift);
-  x_128 = (__m128i *)&x[0];
-  if (format==0) {  // alpha is in shuffled format for complex multiply
-    ((int16_t *)&alpha_128)[0] = alpha[0];
-    ((int16_t *)&alpha_128)[1] = -alpha[1];
-    ((int16_t *)&alpha_128)[2] = alpha[1];
-    ((int16_t *)&alpha_128)[3] = alpha[0];
-    ((int16_t *)&alpha_128)[4] = alpha[0];
-    ((int16_t *)&alpha_128)[5] = -alpha[1];
-    ((int16_t *)&alpha_128)[6] = alpha[1];
-    ((int16_t *)&alpha_128)[7] = alpha[0];
-  } else {  // input is in shuffled format for complex multiply
-    ((int16_t *)&alpha_128)[0] = alpha[0];
-    ((int16_t *)&alpha_128)[1] = alpha[1];
-    ((int16_t *)&alpha_128)[2] = alpha[0];
-    ((int16_t *)&alpha_128)[3] = alpha[1];
-    ((int16_t *)&alpha_128)[4] = alpha[0];
-    ((int16_t *)&alpha_128)[5] = alpha[1];
-    ((int16_t *)&alpha_128)[6] = alpha[0];
-    ((int16_t *)&alpha_128)[7] = alpha[1];
-  }
-  y_128 = (__m128i *)&y[0];
-  // we compute 4 cpx multiply for each loop
-  for(i=0; i<(N>>1); i++) {
-    m0 = _mm_madd_epi16(x_128[i],alpha_128); //pmaddwd_r2r(mm1,mm0);  // 1- compute x1[0]*x2[0]
-    m0 = _mm_sra_epi32(m0,shift);  // 1- shift right by shift in order to compensate for the input amplitude
-    m1=m0;
-    m1 = _mm_packs_epi32(m1,m0);  // 1- pack in a 128 bit register [re im re im]
-    y_128[i] = _mm_unpacklo_epi32(m1,m1);  // 1- pack in a 128 bit register [re im re im]
-    if (format==1) {  // Put output in proper format (Re,-Im,Im,Re), shuffle = (0,1,3,2) = 0x1e
-      y_128[i] = _mm_shufflelo_epi16(y_128[i],0x1e);
-      y_128[i] = _mm_shufflehi_epi16(y_128[i],0x1e);
-      ((int16_t*)&y_128[i])[1] = -((int16_t*)&y_128[i])[1];
-      ((int16_t*)&y_128[i])[5] = -((int16_t*)&y_128[i])[5];
+  // multiply a complex vector with a complex value (alpha)
+  // stores result in y
+  // N is the number of complex numbers
+  // output_shift reduces the result of the multiplication by this number of bits
+  AssertFatal(N%8==0, "To be developped");
+  const c16_t for_re={alpha[0], -alpha[1]};
+  __m256i const alpha_for_real = _mm256_set1_epi32(*(uint32_t*)&for_re);
+  const c16_t for_im={alpha[1], alpha[0]};
+  __m256i const alpha_for_im= _mm256_set1_epi32(*(uint32_t*)&for_im);
+  __m256i const perm_mask =
+    _mm256_set_epi8(31,30,23,22,29,28,21,20,27,26,19,18,25,24,17,16,
+                    15,14,7,6,13,12,5,4,11,10,3,2,9,8,1,0);
+  __m256i* xd= (__m256i*)x;
+  const __m256i *end=xd+N/8;
+  for( __m256i* yd = (__m256i *)y; xd<end ; yd++, xd++) {
+    const __m256i xre = _mm256_srai_epi32(_mm256_madd_epi16(*xd,alpha_for_real),
+                                          output_shift);
+    const __m256i xim = _mm256_srai_epi32(_mm256_madd_epi16(*xd,alpha_for_im),
+                                          output_shift);
+    // a bit faster than unpacklo+unpackhi+packs
+    const __m256i tmp=_mm256_packs_epi32(xre,xim);
+    *yd=_mm256_shuffle_epi8(tmp,perm_mask);
   }
-  }
-  _mm_empty();
-  _m_empty();
-  return(0);
 }
-*/
-int rotate_cpx_vector(int16_t *x,
+#else
+void rotate_cpx_vector(int16_t *x,
                        int16_t *alpha,
                        int16_t *y,
                        uint32_t N,
@@ -439,9 +273,9 @@ int rotate_cpx_vector(int16_t *x,
   _mm_empty();
   _m_empty();
-  return(0);
+  return;
 }
+#endif
 /*
 int mult_vector32_scalar(int16_t *x1,
                          int x2,
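For readers skimming the diff: the new AVX2 rotate_cpx_vector multiplies each interleaved 16-bit complex sample by the complex scalar alpha, right-shifts the 32-bit products by output_shift, and saturates back to 16 bits. A minimal scalar sketch of that arithmetic follows; it is an illustration only, not part of the commit, and the names sat16 and rotate_cpx_vector_scalar are hypothetical.

```c
#include <stdint.h>

/* Saturate a 32-bit intermediate to int16_t, mirroring _mm256_packs_epi32. */
static int16_t sat16(int32_t v) {
  if (v > 32767)  return 32767;
  if (v < -32768) return -32768;
  return (int16_t)v;
}

/* Per complex sample: y[i] = sat16((x[i] * alpha) >> output_shift), with x, y
 * and alpha stored as interleaved int16_t real/imag pairs (Q1.15 fixed point).
 * The AVX2 path forms the same two dot products with _mm256_madd_epi16:
 * the (re, -im) constant gives the real part, the (im, re) constant the imaginary part. */
void rotate_cpx_vector_scalar(const int16_t *x, const int16_t *alpha,
                              int16_t *y, uint32_t N, uint16_t output_shift) {
  const int32_t ar = alpha[0], ai = alpha[1];
  for (uint32_t i = 0; i < N; i++) {
    const int32_t xr = x[2 * i], xi = x[2 * i + 1];
    y[2 * i]     = sat16((xr * ar - xi * ai) >> output_shift);
    y[2 * i + 1] = sat16((xr * ai + xi * ar) >> output_shift);
  }
}
```

The _mm256_madd_epi16 calls in the commit compute these two dot products for eight complex samples at a time, and the closing _mm256_packs_epi32/_mm256_shuffle_epi8 pair re-interleaves the packed real and imaginary halves back into the |Re Im| layout.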
@@ -37,6 +37,7 @@ extern "C" {
 #include <stdint.h>
 #include <assert.h>
 #include "PHY/sse_intrin.h"
+#include "common/utils/assertions.h"
 #define CEILIDIV(a,b) ((a+b-1)/b)
 #define ROUNDIDIV(a,b) (((a<<1)+b)/(b<<1))
@@ -104,15 +105,6 @@ void multadd_complex_vector_real_scalar(int16_t *x,
                                         uint8_t zero_flag,
                                         uint32_t N);
-int rotate_cpx_vector(int16_t *x,
-                      int16_t *alpha,
-                      int16_t *y,
-                      uint32_t N,
-                      uint16_t output_shift);
 /*!\fn void init_fft(uint16_t size,uint8_t logsize,uint16_t *rev)
 \brief Initialize the FFT engine for a given size
 @param size Size of the FFT
@@ -471,7 +463,7 @@ This function performs componentwise multiplication of a vector with a complex s
 The function implemented is : \f$\mathbf{y} = \alpha\mathbf{x}\f$
 */
-int32_t rotate_cpx_vector(int16_t *x,
+void rotate_cpx_vector(int16_t *x,
                        int16_t *alpha,
                        int16_t *y,
                        uint32_t N,
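A hedged usage sketch of the changed prototype follows. The buffer size, the 32-byte aligned allocation and the Q1.15 constant are assumptions for illustration: the AVX2 path casts x and y to __m256i* and asserts N%8==0, so callers are presumed to supply suitably aligned buffers and a multiple of eight complex samples.

```c
#include <stdint.h>
#include <stdlib.h>

/* Prototype as declared after this commit (it previously returned int32_t). */
void rotate_cpx_vector(int16_t *x, int16_t *alpha, int16_t *y,
                       uint32_t N, uint16_t output_shift);

int main(void) {
  const uint32_t N = 1024;                 /* multiple of 8 for the AVX2 path */
  int16_t *x = aligned_alloc(32, N * 2 * sizeof(int16_t));
  int16_t *y = aligned_alloc(32, N * 2 * sizeof(int16_t));
  int16_t alpha[2] = {23170, 23170};       /* roughly (1+j)/sqrt(2) in Q1.15 */

  for (uint32_t i = 0; i < 2 * N; i++)
    x[i] = (int16_t)((i * 37) & 0x3fff);   /* arbitrary test pattern */

  rotate_cpx_vector(x, alpha, y, N, 15);   /* returns void since this commit */

  free(x);
  free(y);
  return 0;
}
```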