Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
O
OpenXG-RAN
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Michael Black
OpenXG-RAN
Commits
bf219fc2
Commit
bf219fc2
authored
Jun 16, 2022
by
Laurent Thomas
Committed by
laurent
Jun 27, 2022
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
avx2 and better simd algo
parent
6d030ea2
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
31 additions
and
205 deletions
+31
-205
openair1/PHY/TOOLS/cmult_sv.c
openair1/PHY/TOOLS/cmult_sv.c
+29
-195
openair1/PHY/TOOLS/tools_defs.h
openair1/PHY/TOOLS/tools_defs.h
+2
-10
No files found.
openair1/PHY/TOOLS/cmult_sv.c
View file @
bf219fc2
...
...
@@ -144,205 +144,39 @@ void multadd_real_four_symbols_vector_complex_scalar(int16_t *x,
_m_empty
();
}
/*
int rotate_cpx_vector(int16_t *x,
#ifdef __AVX2__
void
rotate_cpx_vector
(
int16_t
*
x
,
int16_t
*
alpha
,
int16_t
*
y
,
uint32_t
N
,
uint16_t output_shift,
uint8_t format)
{
// Multiply elementwise two complex vectors of N elements
// x - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
// We assume x1 with a dynamic of 15 bit maximum
//
// alpha - input 2 in the format |Re0 Im0|
// We assume x2 with a dynamic of 15 bit maximum
//
// y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
//
// N - the size of the vectors (this function does N cpx mpy). WARNING: N>=4;
//
// output_shift - shift at output to return in Q1.15
// format - 0 means alpha is in shuffled format, 1 means x is in shuffled format
uint32_t i; // loop counter
register __m128i m0,m1;
__m128i *x_128;
__m128i *y_128;
shift = _mm_cvtsi32_si128(output_shift);
x_128 = (__m128i *)&x[0];
if (format==0) { // alpha is in shuffled format for complex multiply
((int16_t *)&alpha_128)[0] = alpha[0];
((int16_t *)&alpha_128)[1] = -alpha[1];
((int16_t *)&alpha_128)[2] = alpha[1];
((int16_t *)&alpha_128)[3] = alpha[0];
((int16_t *)&alpha_128)[4] = alpha[0];
((int16_t *)&alpha_128)[5] = -alpha[1];
((int16_t *)&alpha_128)[6] = alpha[1];
((int16_t *)&alpha_128)[7] = alpha[0];
} else { // input is in shuffled format for complex multiply
((int16_t *)&alpha_128)[0] = alpha[0];
((int16_t *)&alpha_128)[1] = alpha[1];
((int16_t *)&alpha_128)[2] = alpha[0];
((int16_t *)&alpha_128)[3] = alpha[1];
((int16_t *)&alpha_128)[4] = alpha[0];
((int16_t *)&alpha_128)[5] = alpha[1];
((int16_t *)&alpha_128)[6] = alpha[0];
((int16_t *)&alpha_128)[7] = alpha[1];
}
y_128 = (__m128i *)&y[0];
// _mm_empty();
// return(0);
// we compute 4 cpx multiply for each loop
for(i=0; i<(N>>3); i++) {
m0 = _mm_madd_epi16(x_128[0],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
m1=m0;
m0 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
y_128[0] = _mm_unpacklo_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im]
m0 = _mm_madd_epi16(x_128[1],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
m1 = m0;
m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
y_128[1] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im]
m0 = _mm_madd_epi16(x_128[2],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
m1 = m0;
m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
y_128[2] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im]
m0 = _mm_madd_epi16(x_128[3],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
m1 = m0;
m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
y_128[3] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im]
if (format==1) { // Put output in proper format (Re,-Im,Im,Re), shuffle = (0,1,3,2) = 0x1e
y_128[0] = _mm_shufflelo_epi16(y_128[0],0x1e);
y_128[0] = _mm_shufflehi_epi16(y_128[0],0x1e);
((int16_t*)&y_128[0])[1] = -((int16_t*)&y_128[0])[1];
((int16_t*)&y_128[0])[5] = -((int16_t*)&y_128[0])[5];
y_128[1] = _mm_shufflelo_epi16(y_128[1],0x1e);
y_128[1] = _mm_shufflehi_epi16(y_128[1],0x1e);
((int16_t*)&y_128[1])[1] = -((int16_t*)&y_128[1])[1];
((int16_t*)&y_128[1])[5] = -((int16_t*)&y_128[1])[5];
y_128[2] = _mm_shufflelo_epi16(y_128[2],0x1e);
y_128[2] = _mm_shufflehi_epi16(y_128[2],0x1e);
((int16_t*)&y_128[2])[1] = -((int16_t*)&y_128[2])[1];
((int16_t*)&y_128[2])[5] = -((int16_t*)&y_128[2])[5];
y_128[3] = _mm_shufflelo_epi16(y_128[3],0x1e);
y_128[3] = _mm_shufflehi_epi16(y_128[3],0x1e);
((int16_t*)&y_128[3])[1] = -((int16_t*)&y_128[3])[1];
((int16_t*)&y_128[3])[5] = -((int16_t*)&y_128[3])[5];
}
x_128+=4;
y_128 +=4;
}
_mm_empty();
_m_empty();
return(0);
}
int rotate_cpx_vector2(int16_t *x,
int16_t *alpha,
int16_t *y,
uint32_t N,
uint16_t output_shift,
uint8_t format)
uint16_t
output_shift
)
{
// Multiply elementwise two complex vectors of N elements
// x - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
// We assume x1 with a dynamic of 15 bit maximum
//
// alpha - input 2 in the format |Re0 Im0|
// We assume x2 with a dynamic of 15 bit maximum
//
// y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
//
// N - the size of the vectors (this function does N cpx mpy). WARNING: N>=4;
//
// log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0)
// WARNING: log2_amp>0 can cause overflow!!
uint32_t i; // loop counter
register __m128i m0,m1;
__m128i *x_128;
__m128i *y_128;
shift = _mm_cvtsi32_si128(output_shift);
x_128 = (__m128i *)&x[0];
if (format==0) { // alpha is in shuffled format for complex multiply
((int16_t *)&alpha_128)[0] = alpha[0];
((int16_t *)&alpha_128)[1] = -alpha[1];
((int16_t *)&alpha_128)[2] = alpha[1];
((int16_t *)&alpha_128)[3] = alpha[0];
((int16_t *)&alpha_128)[4] = alpha[0];
((int16_t *)&alpha_128)[5] = -alpha[1];
((int16_t *)&alpha_128)[6] = alpha[1];
((int16_t *)&alpha_128)[7] = alpha[0];
} else { // input is in shuffled format for complex multiply
((int16_t *)&alpha_128)[0] = alpha[0];
((int16_t *)&alpha_128)[1] = alpha[1];
((int16_t *)&alpha_128)[2] = alpha[0];
((int16_t *)&alpha_128)[3] = alpha[1];
((int16_t *)&alpha_128)[4] = alpha[0];
((int16_t *)&alpha_128)[5] = alpha[1];
((int16_t *)&alpha_128)[6] = alpha[0];
((int16_t *)&alpha_128)[7] = alpha[1];
// multiply a complex vector with a complex value (alpha)
// stores result in y
// N is the number of complex numbers
// output_shift reduces the result of the multiplication by this number of bits
AssertFatal
(
N
%
8
==
0
,
"To be developped"
);
const
c16_t
for_re
=
{
alpha
[
0
],
-
alpha
[
1
]};
__m256i
const
alpha_for_real
=
_mm256_set1_epi32
(
*
(
uint32_t
*
)
&
for_re
);
const
c16_t
for_im
=
{
alpha
[
1
],
alpha
[
0
]};
__m256i
const
alpha_for_im
=
_mm256_set1_epi32
(
*
(
uint32_t
*
)
&
for_im
);
__m256i
const
perm_mask
=
_mm256_set_epi8
(
31
,
30
,
23
,
22
,
29
,
28
,
21
,
20
,
27
,
26
,
19
,
18
,
25
,
24
,
17
,
16
,
15
,
14
,
7
,
6
,
13
,
12
,
5
,
4
,
11
,
10
,
3
,
2
,
9
,
8
,
1
,
0
);
__m256i
*
xd
=
(
__m256i
*
)
x
;
const
__m256i
*
end
=
xd
+
N
/
8
;
for
(
__m256i
*
yd
=
(
__m256i
*
)
y
;
xd
<
end
;
yd
++
,
xd
++
)
{
const
__m256i
xre
=
_mm256_srai_epi32
(
_mm256_madd_epi16
(
*
xd
,
alpha_for_real
),
output_shift
);
const
__m256i
xim
=
_mm256_srai_epi32
(
_mm256_madd_epi16
(
*
xd
,
alpha_for_im
),
output_shift
);
// a bit faster than unpacklo+unpackhi+packs
const
__m256i
tmp
=
_mm256_packs_epi32
(
xre
,
xim
);
*
yd
=
_mm256_shuffle_epi8
(
tmp
,
perm_mask
);
}
y_128 = (__m128i *)&y[0];
// we compute 4 cpx multiply for each loop
for(i=0; i<(N>>1); i++) {
m0 = _mm_madd_epi16(x_128[i],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
m1=m0;
m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
y_128[i] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im]
if (format==1) { // Put output in proper format (Re,-Im,Im,Re), shuffle = (0,1,3,2) = 0x1e
y_128[i] = _mm_shufflelo_epi16(y_128[i],0x1e);
y_128[i] = _mm_shufflehi_epi16(y_128[i],0x1e);
((int16_t*)&y_128[i])[1] = -((int16_t*)&y_128[i])[1];
((int16_t*)&y_128[i])[5] = -((int16_t*)&y_128[i])[5];
}
}
_mm_empty();
_m_empty();
return(0);
}
*/
int
rotate_cpx_vector
(
int16_t
*
x
,
#else
void
rotate_cpx_vector
(
int16_t
*
x
,
int16_t
*
alpha
,
int16_t
*
y
,
uint32_t
N
,
...
...
@@ -439,9 +273,9 @@ int rotate_cpx_vector(int16_t *x,
_mm_empty
();
_m_empty
();
return
(
0
)
;
return
;
}
#endif
/*
int mult_vector32_scalar(int16_t *x1,
int x2,
...
...
openair1/PHY/TOOLS/tools_defs.h
View file @
bf219fc2
...
...
@@ -37,6 +37,7 @@ extern "C" {
#include <stdint.h>
#include <assert.h>
#include "PHY/sse_intrin.h"
#include "common/utils/assertions.h"
#define CEILIDIV(a,b) ((a+b-1)/b)
#define ROUNDIDIV(a,b) (((a<<1)+b)/(b<<1))
...
...
@@ -104,15 +105,6 @@ void multadd_complex_vector_real_scalar(int16_t *x,
uint8_t
zero_flag
,
uint32_t
N
);
int
rotate_cpx_vector
(
int16_t
*
x
,
int16_t
*
alpha
,
int16_t
*
y
,
uint32_t
N
,
uint16_t
output_shift
);
/*!\fn void init_fft(uint16_t size,uint8_t logsize,uint16_t *rev)
\brief Initialize the FFT engine for a given size
@param size Size of the FFT
...
...
@@ -471,7 +463,7 @@ This function performs componentwise multiplication of a vector with a complex s
The function implemented is : \f$\mathbf{y} = \alpha\mathbf{x}\f$
*/
int32_t
rotate_cpx_vector
(
int16_t
*
x
,
void
rotate_cpx_vector
(
int16_t
*
x
,
int16_t
*
alpha
,
int16_t
*
y
,
uint32_t
N
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment