Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
O
OpenXG-RAN
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zzha zzha
OpenXG-RAN
Commits
bf219fc2
Commit
bf219fc2
authored
Jun 16, 2022
by
Laurent Thomas
Committed by
laurent
Jun 27, 2022
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
avx2 and better simd algo
parent
6d030ea2
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
31 additions
and
205 deletions
+31
-205
openair1/PHY/TOOLS/cmult_sv.c
openair1/PHY/TOOLS/cmult_sv.c
+29
-195
openair1/PHY/TOOLS/tools_defs.h
openair1/PHY/TOOLS/tools_defs.h
+2
-10
No files found.
openair1/PHY/TOOLS/cmult_sv.c
View file @
bf219fc2
...
@@ -144,205 +144,39 @@ void multadd_real_four_symbols_vector_complex_scalar(int16_t *x,
...
@@ -144,205 +144,39 @@ void multadd_real_four_symbols_vector_complex_scalar(int16_t *x,
_m_empty
();
_m_empty
();
}
}
#ifdef __AVX2__
/*
void
rotate_cpx_vector
(
int16_t
*
x
,
int rotate_cpx_vector(int16_t *x,
int16_t
*
alpha
,
int16_t
*
alpha
,
int16_t
*
y
,
int16_t
*
y
,
uint32_t
N
,
uint32_t
N
,
uint16_t output_shift,
uint16_t
output_shift
)
uint8_t format)
{
// Multiply elementwise two complex vectors of N elements
// x - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
// We assume x1 with a dynamic of 15 bit maximum
//
// alpha - input 2 in the format |Re0 Im0|
// We assume x2 with a dynamic of 15 bit maximum
//
// y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
//
// N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
//
// output_shift - shift at output to return in Q1.15
// format - 0 means alpha is in shuffled format, 1 means x is in shuffled format
uint32_t i; // loop counter
register __m128i m0,m1;
__m128i *x_128;
__m128i *y_128;
shift = _mm_cvtsi32_si128(output_shift);
x_128 = (__m128i *)&x[0];
if (format==0) { // alpha is in shuffled format for complex multiply
((int16_t *)&alpha_128)[0] = alpha[0];
((int16_t *)&alpha_128)[1] = -alpha[1];
((int16_t *)&alpha_128)[2] = alpha[1];
((int16_t *)&alpha_128)[3] = alpha[0];
((int16_t *)&alpha_128)[4] = alpha[0];
((int16_t *)&alpha_128)[5] = -alpha[1];
((int16_t *)&alpha_128)[6] = alpha[1];
((int16_t *)&alpha_128)[7] = alpha[0];
} else { // input is in shuffled format for complex multiply
((int16_t *)&alpha_128)[0] = alpha[0];
((int16_t *)&alpha_128)[1] = alpha[1];
((int16_t *)&alpha_128)[2] = alpha[0];
((int16_t *)&alpha_128)[3] = alpha[1];
((int16_t *)&alpha_128)[4] = alpha[0];
((int16_t *)&alpha_128)[5] = alpha[1];
((int16_t *)&alpha_128)[6] = alpha[0];
((int16_t *)&alpha_128)[7] = alpha[1];
}
y_128 = (__m128i *)&y[0];
// _mm_empty();
// return(0);
// we compute 4 cpx multiply for each loop
for(i=0; i<(N>>3); i++) {
m0 = _mm_madd_epi16(x_128[0],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
m1=m0;
m0 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
y_128[0] = _mm_unpacklo_epi32(m0,m0); // 1- pack in a 128 bit register [re im re im]
m0 = _mm_madd_epi16(x_128[1],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
m1 = m0;
m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
y_128[1] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im]
m0 = _mm_madd_epi16(x_128[2],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
m1 = m0;
m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
y_128[2] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im]
m0 = _mm_madd_epi16(x_128[3],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
m1 = m0;
m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
y_128[3] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im]
if (format==1) { // Put output in proper format (Re,-Im,Im,Re), shuffle = (0,1,3,2) = 0x1e
y_128[0] = _mm_shufflelo_epi16(y_128[0],0x1e);
y_128[0] = _mm_shufflehi_epi16(y_128[0],0x1e);
((int16_t*)&y_128[0])[1] = -((int16_t*)&y_128[0])[1];
((int16_t*)&y_128[0])[5] = -((int16_t*)&y_128[0])[5];
y_128[1] = _mm_shufflelo_epi16(y_128[1],0x1e);
y_128[1] = _mm_shufflehi_epi16(y_128[1],0x1e);
((int16_t*)&y_128[1])[1] = -((int16_t*)&y_128[1])[1];
((int16_t*)&y_128[1])[5] = -((int16_t*)&y_128[1])[5];
y_128[2] = _mm_shufflelo_epi16(y_128[2],0x1e);
y_128[2] = _mm_shufflehi_epi16(y_128[2],0x1e);
((int16_t*)&y_128[2])[1] = -((int16_t*)&y_128[2])[1];
((int16_t*)&y_128[2])[5] = -((int16_t*)&y_128[2])[5];
y_128[3] = _mm_shufflelo_epi16(y_128[3],0x1e);
y_128[3] = _mm_shufflehi_epi16(y_128[3],0x1e);
((int16_t*)&y_128[3])[1] = -((int16_t*)&y_128[3])[1];
((int16_t*)&y_128[3])[5] = -((int16_t*)&y_128[3])[5];
}
x_128+=4;
y_128 +=4;
}
_mm_empty();
_m_empty();
return(0);
}
int rotate_cpx_vector2(int16_t *x,
int16_t *alpha,
int16_t *y,
uint32_t N,
uint16_t output_shift,
uint8_t format)
{
{
// Multiply elementwise two complex vectors of N elements
// multiply a complex vector with a complex value (alpha)
// x - input 1 in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
// stores result in y
// We assume x1 with a dynamic of 15 bit maximum
// N is the number of complex numbers
//
// output_shift reduces the result of the multiplication by this number of bits
// alpha - input 2 in the format |Re0 Im0|
AssertFatal
(
N
%
8
==
0
,
"To be developped"
);
// We assume x2 with a dynamic of 15 bit maximum
const
c16_t
for_re
=
{
alpha
[
0
],
-
alpha
[
1
]};
//
__m256i
const
alpha_for_real
=
_mm256_set1_epi32
(
*
(
uint32_t
*
)
&
for_re
);
// y - output in the format |Re0 Im0 Re0 Im0|,......,|Re(N-1) Im(N-1) Re(N-1) Im(N-1)|
const
c16_t
for_im
=
{
alpha
[
1
],
alpha
[
0
]};
//
__m256i
const
alpha_for_im
=
_mm256_set1_epi32
(
*
(
uint32_t
*
)
&
for_im
);
// N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4;
__m256i
const
perm_mask
=
//
_mm256_set_epi8
(
31
,
30
,
23
,
22
,
29
,
28
,
21
,
20
,
27
,
26
,
19
,
18
,
25
,
24
,
17
,
16
,
// log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0)
15
,
14
,
7
,
6
,
13
,
12
,
5
,
4
,
11
,
10
,
3
,
2
,
9
,
8
,
1
,
0
);
// WARNING: log2_amp>0 can cause overflow!!
__m256i
*
xd
=
(
__m256i
*
)
x
;
const
__m256i
*
end
=
xd
+
N
/
8
;
uint32_t i; // loop counter
for
(
__m256i
*
yd
=
(
__m256i
*
)
y
;
xd
<
end
;
yd
++
,
xd
++
)
{
const
__m256i
xre
=
_mm256_srai_epi32
(
_mm256_madd_epi16
(
*
xd
,
alpha_for_real
),
register __m128i m0,m1;
output_shift
);
const
__m256i
xim
=
_mm256_srai_epi32
(
_mm256_madd_epi16
(
*
xd
,
alpha_for_im
),
output_shift
);
__m128i *x_128;
// a bit faster than unpacklo+unpackhi+packs
__m128i *y_128;
const
__m256i
tmp
=
_mm256_packs_epi32
(
xre
,
xim
);
*
yd
=
_mm256_shuffle_epi8
(
tmp
,
perm_mask
);
shift = _mm_cvtsi32_si128(output_shift);
x_128 = (__m128i *)&x[0];
if (format==0) { // alpha is in shuffled format for complex multiply
((int16_t *)&alpha_128)[0] = alpha[0];
((int16_t *)&alpha_128)[1] = -alpha[1];
((int16_t *)&alpha_128)[2] = alpha[1];
((int16_t *)&alpha_128)[3] = alpha[0];
((int16_t *)&alpha_128)[4] = alpha[0];
((int16_t *)&alpha_128)[5] = -alpha[1];
((int16_t *)&alpha_128)[6] = alpha[1];
((int16_t *)&alpha_128)[7] = alpha[0];
} else { // input is in shuffled format for complex multiply
((int16_t *)&alpha_128)[0] = alpha[0];
((int16_t *)&alpha_128)[1] = alpha[1];
((int16_t *)&alpha_128)[2] = alpha[0];
((int16_t *)&alpha_128)[3] = alpha[1];
((int16_t *)&alpha_128)[4] = alpha[0];
((int16_t *)&alpha_128)[5] = alpha[1];
((int16_t *)&alpha_128)[6] = alpha[0];
((int16_t *)&alpha_128)[7] = alpha[1];
}
}
y_128 = (__m128i *)&y[0];
// we compute 4 cpx multiply for each loop
for(i=0; i<(N>>1); i++) {
m0 = _mm_madd_epi16(x_128[i],alpha_128); //pmaddwd_r2r(mm1,mm0); // 1- compute x1[0]*x2[0]
m0 = _mm_sra_epi32(m0,shift); // 1- shift right by shift in order to compensate for the input amplitude
m1=m0;
m1 = _mm_packs_epi32(m1,m0); // 1- pack in a 128 bit register [re im re im]
y_128[i] = _mm_unpacklo_epi32(m1,m1); // 1- pack in a 128 bit register [re im re im]
if (format==1) { // Put output in proper format (Re,-Im,Im,Re), shuffle = (0,1,3,2) = 0x1e
y_128[i] = _mm_shufflelo_epi16(y_128[i],0x1e);
y_128[i] = _mm_shufflehi_epi16(y_128[i],0x1e);
((int16_t*)&y_128[i])[1] = -((int16_t*)&y_128[i])[1];
((int16_t*)&y_128[i])[5] = -((int16_t*)&y_128[i])[5];
}
}
_mm_empty();
_m_empty();
return(0);
}
}
*/
#else
void
rotate_cpx_vector
(
int16_t
*
x
,
int
rotate_cpx_vector
(
int16_t
*
x
,
int16_t
*
alpha
,
int16_t
*
alpha
,
int16_t
*
y
,
int16_t
*
y
,
uint32_t
N
,
uint32_t
N
,
...
@@ -439,9 +273,9 @@ int rotate_cpx_vector(int16_t *x,
...
@@ -439,9 +273,9 @@ int rotate_cpx_vector(int16_t *x,
_mm_empty
();
_mm_empty
();
_m_empty
();
_m_empty
();
return
(
0
)
;
return
;
}
}
#endif
/*
/*
int mult_vector32_scalar(int16_t *x1,
int mult_vector32_scalar(int16_t *x1,
int x2,
int x2,
...
...
openair1/PHY/TOOLS/tools_defs.h
View file @
bf219fc2
...
@@ -37,6 +37,7 @@ extern "C" {
...
@@ -37,6 +37,7 @@ extern "C" {
#include <stdint.h>
#include <stdint.h>
#include <assert.h>
#include <assert.h>
#include "PHY/sse_intrin.h"
#include "PHY/sse_intrin.h"
#include "common/utils/assertions.h"
#define CEILIDIV(a,b) ((a+b-1)/b)
#define CEILIDIV(a,b) ((a+b-1)/b)
#define ROUNDIDIV(a,b) (((a<<1)+b)/(b<<1))
#define ROUNDIDIV(a,b) (((a<<1)+b)/(b<<1))
...
@@ -104,15 +105,6 @@ void multadd_complex_vector_real_scalar(int16_t *x,
...
@@ -104,15 +105,6 @@ void multadd_complex_vector_real_scalar(int16_t *x,
uint8_t
zero_flag
,
uint8_t
zero_flag
,
uint32_t
N
);
uint32_t
N
);
int
rotate_cpx_vector
(
int16_t
*
x
,
int16_t
*
alpha
,
int16_t
*
y
,
uint32_t
N
,
uint16_t
output_shift
);
/*!\fn void init_fft(uint16_t size,uint8_t logsize,uint16_t *rev)
/*!\fn void init_fft(uint16_t size,uint8_t logsize,uint16_t *rev)
\brief Initialize the FFT engine for a given size
\brief Initialize the FFT engine for a given size
@param size Size of the FFT
@param size Size of the FFT
...
@@ -471,7 +463,7 @@ This function performs componentwise multiplication of a vector with a complex s
...
@@ -471,7 +463,7 @@ This function performs componentwise multiplication of a vector with a complex s
The function implemented is : \f$\mathbf{y} = \alpha\mathbf{x}\f$
The function implemented is : \f$\mathbf{y} = \alpha\mathbf{x}\f$
*/
*/
int32_t
rotate_cpx_vector
(
int16_t
*
x
,
void
rotate_cpx_vector
(
int16_t
*
x
,
int16_t
*
alpha
,
int16_t
*
alpha
,
int16_t
*
y
,
int16_t
*
y
,
uint32_t
N
,
uint32_t
N
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment