Commit ba0e786e authored by Florian Kaltenberger's avatar Florian Kaltenberger

adding multadd_cpx_vector

parent 27a5ccf4
......@@ -27,6 +27,7 @@
#if defined(__x86_64__) || defined(__i386__)
int16_t conjug[8]__attribute__((aligned(16))) = {-1,1,-1,1,-1,1,-1,1} ;
int16_t conjug2[8]__attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1} ;
#define simd_q15_t __m128i
#define simdshort_q15_t __m64
#elif defined(__arm__)
......@@ -134,3 +135,81 @@ int mult_cpx_conj_vector(int16_t *x1,
return(0);
}
int multadd_cpx_vector(int16_t *x1,
int16_t *x2,
int16_t *y,
uint8_t zero_flag,
uint32_t N,
int output_shift)
{
// Multiply elementwise the complex conjugate of x1 with x2.
// x1 - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
// We assume x1 with a dinamic of 15 bit maximum
//
// x2 - input 2 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
// We assume x2 with a dinamic of 14 bit maximum
///
// y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
//
// zero_flag - Set output (y) to zero prior to disable accumulation
//
// N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
//
// output_shift - shift to be applied to generate output
uint32_t i; // loop counter
simd_q15_t *x1_128;
simd_q15_t *x2_128;
simd_q15_t *y_128;
#if defined(__x86_64__) || defined(__i386__)
simd_q15_t tmp_re,tmp_im;
simd_q15_t tmpy0,tmpy1;
#elif defined(__arm__)
int32x4_t tmp_re,tmp_im;
int32x4_t tmp_re1,tmp_im1;
int16x4x2_t tmpy;
int32x4_t shift = vdupq_n_s32(-output_shift);
#endif
x1_128 = (simd_q15_t *)&x1[0];
x2_128 = (simd_q15_t *)&x2[0];
y_128 = (simd_q15_t *)&y[0];
// we compute 4 cpx multiply for each loop
for(i=0; i<(N>>2); i++) {
#if defined(__x86_64__) || defined(__i386__)
tmp_re = _mm_sign_epi16(*x1_128,*(__m128i*)&conjug2[0]);
tmp_re = _mm_madd_epi16(tmp_re,*x2_128);
tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1));
tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1));
tmp_im = _mm_madd_epi16(tmp_im,*x2_128);
tmp_re = _mm_srai_epi32(tmp_re,output_shift);
tmp_im = _mm_srai_epi32(tmp_im,output_shift);
tmpy0 = _mm_unpacklo_epi32(tmp_re,tmp_im);
//print_ints("unpack lo:",&tmpy0[i]);
tmpy1 = _mm_unpackhi_epi32(tmp_re,tmp_im);
//print_ints("unpack hi:",&tmpy1[i]);
if (zero_flag == 1)
*y_128 = _mm_packs_epi32(tmpy0,tmpy1);
else
*y_128 = _mm_adds_epi16(*y_128,_mm_packs_epi32(tmpy0,tmpy1));
//print_shorts("*y_128:",&y_128[i]);
#elif defined(__arm__)
msg("mult_cpx_vector not implemented for __arm__");
#endif
x1_128++;
x2_128++;
y_128++;
}
_mm_empty();
_m_empty();
return(0);
}
......@@ -126,6 +126,25 @@ int mult_cpx_conj_vector(int16_t *x1,
int output_shift,
int madd);
/*!
Element-wise multiplication and accumulation of two complex vectors x1 and x2.
@param x1 - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
We assume x1 with a dinamic of 15 bit maximum
@param x2 - input 2 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
We assume x2 with a dinamic of 14 bit maximum
@param y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)|
@param zero_flag Set output (y) to zero prior to accumulation
@param N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
@param output_shift - shift to be applied to generate output
*/
int multadd_cpx_vector(int16_t *x1,
int16_t *x2,
int16_t *y,
uint8_t zero_flag,
uint32_t N,
int output_shift);
// lte_dfts.c
void init_fft(uint16_t size,
uint8_t logsize,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment