Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
O
OpenXG-RAN
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
常顺宇
OpenXG-RAN
Commits
015187e5
Commit
015187e5
authored
Jan 23, 2016
by
Raymond Knopp
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
AVX2 optimization activated for gamma computation in 16-bit turbo decoder (single-codeword)
parent
8d4405bd
Changes
7
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
289 additions
and
78 deletions
+289
-78
cmake_targets/CMakeLists.txt
cmake_targets/CMakeLists.txt
+1
-1
openair1/PHY/CODING/3gpplte_sse.c
openair1/PHY/CODING/3gpplte_sse.c
+1
-0
openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
+95
-51
openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
+184
-15
openair1/PHY/LTE_REFSIG/lte_gold.c
openair1/PHY/LTE_REFSIG/lte_gold.c
+1
-5
openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
+2
-1
openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
+5
-5
No files found.
cmake_targets/CMakeLists.txt
View file @
015187e5
...
...
@@ -134,7 +134,7 @@ else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
set
(
C_FLAGS_PROCESSOR
"
${
C_FLAGS_PROCESSOR
}
-mavx2"
)
endif
()
if
(
CPUINFO MATCHES
"sse4_2"
)
set
(
C_FLAGS_PROCESSOR
"
${
C_FLAGS_PROCESSOR
}
-mavx2 -msse4.2"
)
set
(
C_FLAGS_PROCESSOR
"
${
C_FLAGS_PROCESSOR
}
-mavx2 -msse4.2
-fno-tree-vectorize
"
)
endif
()
if
(
CPUINFO MATCHES
"sse4_1"
)
set
(
C_FLAGS_PROCESSOR
"
${
C_FLAGS_PROCESSOR
}
-msse4.1"
)
...
...
openair1/PHY/CODING/3gpplte_sse.c
View file @
015187e5
...
...
@@ -223,6 +223,7 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
0
b00000001
};
#endif
#ifndef __AVX2__
if
((
n
&
15
)
>
0
)
loop
++
;
...
...
openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
View file @
015187e5
...
...
@@ -186,12 +186,16 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16
__m256i
new0
,
new1
,
new2
,
new3
,
new4
,
new5
,
new6
,
new7
;
__m256i
alpha_max
;
unsigned
long
long
timein
,
timeout
;
l2
=
L
>>
3
;
K1
=
(
frame_length
>>
3
);
#ifdef DEBUG_LOGMAP
fprintf
(
fdavx2
,
"Compute alpha (avx2_16bit)
\n
"
);
fprintf
(
fdavx2b
,
"Compute alpha (avx2_16bit)
\n
"
);
#endif
timein
=
rdtsc_oai
();
for
(
l
=
K1
;;
l
=
l2
,
rerun_flag
=
1
)
{
alpha128
=
(
__m256i
*
)
alpha
;
...
...
@@ -378,6 +382,9 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16
if
(
rerun_flag
==
1
)
break
;
}
timeout
=
rdtsc_oai
();
printf
(
"alpha: inner loop time %llu
\n
"
,
timeout
-
timein
);
}
...
...
@@ -386,9 +393,10 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
int
k
,
rerun_flag
=
0
;
__m256i
m11_128
,
m10_128
;
__m256i
m_b0
,
m_b1
,
m_b2
,
m_b3
,
m_b4
,
m_b5
,
m_b6
,
m_b7
;
__m256i
new0
,
new1
,
new2
,
new3
,
new4
,
new5
,
new6
,
new7
;
__m256i
*
m11p
,
*
m10p
;
register
__m256i
b0
,
b1
,
b2
,
b3
,
b4
,
b5
,
b6
,
b7
;
register
__m256i
m_b0
,
m_b1
,
m_b2
,
m_b3
,
m_b4
,
m_b5
,
m_b6
,
m_b7
;
register
__m256i
new0
,
new1
,
new2
,
new3
,
new4
,
new5
,
new6
,
new7
;
__m256i
*
beta128
,
*
alpha128
,
*
beta_ptr
;
__m256i
beta_max
;
...
...
@@ -398,6 +406,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
llr_t
beta0
,
beta1
;
llr_t
beta0_cw2
,
beta1_cw2
;
unsigned
long
long
timein
,
timeout
;
#ifdef DEBUG_LOGMAP
fprintf
(
fdavx2
,
"compute_beta (avx2_16bit), %p,%p,%p,%p,framelength %d,F %d
\n
"
,
beta
,
m_11
,
m_10
,
alpha
,
frame_length
,
F
);
...
...
@@ -590,56 +600,74 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
#endif
int
loopval
=
((
rerun_flag
==
0
)
?
0
:
((
frame_length
-
L
)
>>
3
));
printf
(
"beta: rerun %d => loopval %d
\n
"
,
rerun_flag
,
loopval
);
timein
=
rdtsc_oai
();
m11p
=
(
frame_length
>>
3
)
-
1
+
(
__m256i
*
)
m_11
;
m10p
=
(
frame_length
>>
3
)
-
1
+
(
__m256i
*
)
m_10
;
for
(
k
=
(
frame_length
>>
3
)
-
1
;
k
>=
loopval
;
k
--
)
{
m11_128
=
((
__m256i
*
)
m_11
)[
k
];
m10_128
=
((
__m256i
*
)
m_10
)[
k
];
m_b0
=
_mm256_adds_epi16
(
beta_ptr
[
4
],
m11_128
);
//m11
m_b1
=
_mm256_subs_epi16
(
beta_ptr
[
4
],
m11_128
);
//m00
m_b2
=
_mm256_subs_epi16
(
beta_ptr
[
5
],
m10_128
);
//m01
m_b3
=
_mm256_adds_epi16
(
beta_ptr
[
5
],
m10_128
);
//m10
m_b4
=
_mm256_adds_epi16
(
beta_ptr
[
6
],
m10_128
);
//m10
m_b5
=
_mm256_subs_epi16
(
beta_ptr
[
6
],
m10_128
);
//m01
m_b6
=
_mm256_subs_epi16
(
beta_ptr
[
7
],
m11_128
);
//m00
m_b7
=
_mm256_adds_epi16
(
beta_ptr
[
7
],
m11_128
);
//m11
new0
=
_mm256_subs_epi16
(
beta_ptr
[
0
],
m11_128
);
//m00
new1
=
_mm256_adds_epi16
(
beta_ptr
[
0
],
m11_128
);
//m11
new2
=
_mm256_adds_epi16
(
beta_ptr
[
1
],
m10_128
);
//m10
new3
=
_mm256_subs_epi16
(
beta_ptr
[
1
],
m10_128
);
//m01
new4
=
_mm256_subs_epi16
(
beta_ptr
[
2
],
m10_128
);
//m01
new5
=
_mm256_adds_epi16
(
beta_ptr
[
2
],
m10_128
);
//m10
new6
=
_mm256_adds_epi16
(
beta_ptr
[
3
],
m11_128
);
//m11
new7
=
_mm256_subs_epi16
(
beta_ptr
[
3
],
m11_128
);
//m00
b4
=
_mm256_load_si256
(
&
beta_ptr
[
4
]);
b5
=
_mm256_load_si256
(
&
beta_ptr
[
5
]);
b6
=
_mm256_load_si256
(
&
beta_ptr
[
6
]);
b7
=
_mm256_load_si256
(
&
beta_ptr
[
7
]);
m_b0
=
_mm256_adds_epi16
(
b4
,
*
m11p
);
//m11
m_b1
=
_mm256_subs_epi16
(
b4
,
*
m11p
);
//m00
m_b2
=
_mm256_subs_epi16
(
b5
,
*
m10p
);
//m01
m_b3
=
_mm256_adds_epi16
(
b5
,
*
m10p
);
//m10
m_b4
=
_mm256_adds_epi16
(
b6
,
*
m10p
);
//m10
m_b5
=
_mm256_subs_epi16
(
b6
,
*
m10p
);
//m01
m_b6
=
_mm256_subs_epi16
(
b7
,
*
m11p
);
//m00
m_b7
=
_mm256_adds_epi16
(
b7
,
*
m11p
);
//m11
b0
=
_mm256_load_si256
(
&
beta_ptr
[
0
]);
b1
=
_mm256_load_si256
(
&
beta_ptr
[
1
]);
b2
=
_mm256_load_si256
(
&
beta_ptr
[
2
]);
b3
=
_mm256_load_si256
(
&
beta_ptr
[
3
]);
new0
=
_mm256_subs_epi16
(
b0
,
*
m11p
);
//m00
new1
=
_mm256_adds_epi16
(
b0
,
*
m11p
);
//m11
new2
=
_mm256_adds_epi16
(
b1
,
*
m10p
);
//m10
new3
=
_mm256_subs_epi16
(
b1
,
*
m10p
);
//m01
new4
=
_mm256_subs_epi16
(
b2
,
*
m10p
);
//m01
new5
=
_mm256_adds_epi16
(
b2
,
*
m10p
);
//m10
new6
=
_mm256_adds_epi16
(
b3
,
*
m11p
);
//m11
new7
=
_mm256_subs_epi16
(
b3
,
*
m11p
);
//m00
b0
=
_mm256_max_epi16
(
m_b0
,
new0
);
b1
=
_mm256_max_epi16
(
m_b1
,
new1
);
b2
=
_mm256_max_epi16
(
m_b2
,
new2
);
b3
=
_mm256_max_epi16
(
m_b3
,
new3
);
b4
=
_mm256_max_epi16
(
m_b4
,
new4
);
b5
=
_mm256_max_epi16
(
m_b5
,
new5
);
b6
=
_mm256_max_epi16
(
m_b6
,
new6
);
b7
=
_mm256_max_epi16
(
m_b7
,
new7
);
beta_max
=
_mm256_max_epi16
(
b0
,
b1
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b2
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b3
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b4
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b5
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b6
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b7
);
beta_ptr
-=
8
;
beta_ptr
[
0
]
=
_mm256_max_epi16
(
m_b0
,
new0
);
beta_ptr
[
1
]
=
_mm256_max_epi16
(
m_b1
,
new1
);
beta_ptr
[
2
]
=
_mm256_max_epi16
(
m_b2
,
new2
);
beta_ptr
[
3
]
=
_mm256_max_epi16
(
m_b3
,
new3
);
beta_ptr
[
4
]
=
_mm256_max_epi16
(
m_b4
,
new4
);
beta_ptr
[
5
]
=
_mm256_max_epi16
(
m_b5
,
new5
);
beta_ptr
[
6
]
=
_mm256_max_epi16
(
m_b6
,
new6
);
beta_ptr
[
7
]
=
_mm256_max_epi16
(
m_b7
,
new7
);
beta_max
=
_mm256_max_epi16
(
beta_ptr
[
0
],
beta_ptr
[
1
]);
beta_max
=
_mm256_max_epi16
(
beta_max
,
beta_ptr
[
2
]);
beta_max
=
_mm256_max_epi16
(
beta_max
,
beta_ptr
[
3
]);
beta_max
=
_mm256_max_epi16
(
beta_max
,
beta_ptr
[
4
]);
beta_max
=
_mm256_max_epi16
(
beta_max
,
beta_ptr
[
5
]);
beta_max
=
_mm256_max_epi16
(
beta_max
,
beta_ptr
[
6
]);
beta_max
=
_mm256_max_epi16
(
beta_max
,
beta_ptr
[
7
]);
beta_ptr
[
0
]
=
_mm256_subs_epi16
(
beta_ptr
[
0
],
beta_max
);
beta_ptr
[
1
]
=
_mm256_subs_epi16
(
beta_ptr
[
1
],
beta_max
);
beta_ptr
[
2
]
=
_mm256_subs_epi16
(
beta_ptr
[
2
],
beta_max
);
beta_ptr
[
3
]
=
_mm256_subs_epi16
(
beta_ptr
[
3
],
beta_max
);
beta_ptr
[
4
]
=
_mm256_subs_epi16
(
beta_ptr
[
4
],
beta_max
);
beta_ptr
[
5
]
=
_mm256_subs_epi16
(
beta_ptr
[
5
],
beta_max
);
beta_ptr
[
6
]
=
_mm256_subs_epi16
(
beta_ptr
[
6
],
beta_max
);
beta_ptr
[
7
]
=
_mm256_subs_epi16
(
beta_ptr
[
7
],
beta_max
);
m11p
--
;
m10p
--
;
beta_ptr
[
0
]
=
_mm256_subs_epi16
(
b0
,
beta_max
);
beta_ptr
[
1
]
=
_mm256_subs_epi16
(
b1
,
beta_max
);
beta_ptr
[
2
]
=
_mm256_subs_epi16
(
b2
,
beta_max
);
beta_ptr
[
3
]
=
_mm256_subs_epi16
(
b3
,
beta_max
);
beta_ptr
[
4
]
=
_mm256_subs_epi16
(
b4
,
beta_max
);
beta_ptr
[
5
]
=
_mm256_subs_epi16
(
b5
,
beta_max
);
beta_ptr
[
6
]
=
_mm256_subs_epi16
(
b6
,
beta_max
);
beta_ptr
[
7
]
=
_mm256_subs_epi16
(
b7
,
beta_max
);
#ifdef DEBUG_LOGMAP
fprintf
(
fdavx2
,
"Loop index %d, mb
\n
"
,
k
);
...
...
@@ -658,6 +686,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
#endif
}
timeout
=
rdtsc_oai
();
printf
(
"beta: inner loop time %llu
\n
"
,
timeout
-
timein
);
if
(
rerun_flag
==
1
)
break
;
...
...
@@ -968,7 +998,7 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y,
yp2
=
yparity2
;
#if 0
for (i=0; i<n; i+=8) {
pi2_p = &pi2tab16avx2[iind][i];
...
...
@@ -1084,9 +1114,23 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y,
yp128_cw2+=3;
}
yp=(llr_t*)yp128;
yp_cw2=(llr_t*)yp128_cw2;
#else
pi2_p
=
&
pi2tab16avx2
[
iind
][
0
];
for
(
i
=
0
,
j
=
0
;
i
<
n
;
i
++
)
{
s
[
*
pi2_p
]
=
y
[
j
];
s
[
*
pi2_p
+
8
]
=
y2
[
j
++
];
yp1
[
*
pi2_p
]
=
y
[
j
];
yp1
[
*
pi2_p
+
8
]
=
y2
[
j
++
];
yp2
[
*
pi2_p
]
=
y
[
j
];
yp2
[(
*
pi2_p
++
)
+
8
]
=
y2
[
j
++
];
}
yp
=
(
llr_t
*
)
&
y
[
j
];
yp_cw2
=
(
llr_t
*
)
&
y2
[
j
];
#endif
// Termination
for
(
i
=
0
;
i
<
3
;
i
++
)
{
...
...
openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
View file @
015187e5
This diff is collapsed.
Click to expand it.
openair1/PHY/LTE_REFSIG/lte_gold.c
View file @
015187e5
...
...
@@ -61,21 +61,18 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14]
x2
=
Ncp
+
(
Nid_cell
<<
1
)
+
(((
1
+
(
Nid_cell
<<
1
))
*
(
1
+
(((
frame_parms
->
Ncp
==
0
)
?
4
:
3
)
*
l
)
+
(
7
*
(
1
+
ns
))))
<<
10
);
//cinit
(((
1
+
(
Nid_cell
<<
1
))
*
(
1
+
(((
frame_parms
->
Ncp
==
0
)
?
4
:
3
)
*
l
)
+
(
7
*
(
1
+
ns
))))
<<
10
);
//cinit
//x2 = frame_parms->Ncp + (Nid_cell<<1) + (1+(Nid_cell<<1))*(1 + (3*l) + (7*(1+ns))); //cinit
//n = 0
// printf("cinit (ns %d, l %d) => %d\n",ns,l,x2);
x1
=
1
+
(
1
<<
31
);
x2
=
x2
^
((
x2
^
(
x2
>>
1
)
^
(
x2
>>
2
)
^
(
x2
>>
3
))
<<
31
);
// skip first 50 double words (1600 bits)
//printf("n=0 : x1 %x, x2 %x\n",x1,x2);
for
(
n
=
1
;
n
<
50
;
n
++
)
{
x1
=
(
x1
>>
1
)
^
(
x1
>>
4
);
x1
=
x1
^
(
x1
<<
31
)
^
(
x1
<<
28
);
x2
=
(
x2
>>
1
)
^
(
x2
>>
2
)
^
(
x2
>>
3
)
^
(
x2
>>
4
);
x2
=
x2
^
(
x2
<<
31
)
^
(
x2
<<
30
)
^
(
x2
<<
29
)
^
(
x2
<<
28
);
// printf("x1 : %x, x2 : %x\n",x1,x2);
}
for
(
n
=
0
;
n
<
14
;
n
++
)
{
...
...
@@ -84,7 +81,6 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14]
x2
=
(
x2
>>
1
)
^
(
x2
>>
2
)
^
(
x2
>>
3
)
^
(
x2
>>
4
);
x2
=
x2
^
(
x2
<<
31
)
^
(
x2
<<
30
)
^
(
x2
<<
29
)
^
(
x2
<<
28
);
lte_gold_table
[
ns
][
l
][
n
]
=
x1
^
x2
;
// printf("n=%d : c %x\n",n,x1^x2);
}
}
...
...
openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
View file @
015187e5
...
...
@@ -446,7 +446,8 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
printf("\n");
*/
#ifndef __AVX2__
//#ifndef __AVX2__
#if 1
if
(
err_flag
==
0
)
{
start_meas
(
dlsch_turbo_decoding_stats
);
...
...
openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
View file @
015187e5
...
...
@@ -1898,17 +1898,17 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms,
for
(
aarx
=
0
;
aarx
<
frame_parms
->
nb_antennas_rx
;
aarx
++
)
{
dl_ch0_128
=
(
__m128i
*
)
&
dl_ch_estimates_ext
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
dl_ch1_128
=
(
__m128i
*
)
&
dl_ch_estimates_ext
[
2
+
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
dl_ch0_128
=
(
__m128i
*
)
&
dl_ch_estimates_ext
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
// hr,0
dl_ch1_128
=
(
__m128i
*
)
&
dl_ch_estimates_ext
[
2
+
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
// hr,1
dl_ch_mag0_128
=
(
__m128i
*
)
&
dl_ch_mag0
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
dl_ch_mag0_128b
=
(
__m128i
*
)
&
dl_ch_magb0
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
dl_ch_mag1_128
=
(
__m128i
*
)
&
dl_ch_mag1
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
dl_ch_mag1_128b
=
(
__m128i
*
)
&
dl_ch_magb1
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
rxdataF128
=
(
__m128i
*
)
&
rxdataF_ext
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
rxdataF_comp0_128
=
(
__m128i
*
)
&
rxdataF_comp0
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
rxdataF_comp1_128
=
(
__m128i
*
)
&
rxdataF_comp1
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
rxdataF128
=
(
__m128i
*
)
&
rxdataF_ext
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
// yr
rxdataF_comp0_128
=
(
__m128i
*
)
&
rxdataF_comp0
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
// yr,0 = yr * conj(hr,0)
rxdataF_comp1_128
=
(
__m128i
*
)
&
rxdataF_comp1
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
// yr,1 = yr * conj(hr,1)
for
(
rb
=
0
;
rb
<
nb_rb
;
rb
++
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment