Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
O
OpenXG-RAN
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
spbro
OpenXG-RAN
Commits
015187e5
Commit
015187e5
authored
Jan 23, 2016
by
Raymond Knopp
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
AVX2 optimization activated for gamma computation in 16-bit turbo decoder (single-codeword)
parent
8d4405bd
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
289 additions
and
78 deletions
+289
-78
cmake_targets/CMakeLists.txt
cmake_targets/CMakeLists.txt
+1
-1
openair1/PHY/CODING/3gpplte_sse.c
openair1/PHY/CODING/3gpplte_sse.c
+1
-0
openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
+95
-51
openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
+184
-15
openair1/PHY/LTE_REFSIG/lte_gold.c
openair1/PHY/LTE_REFSIG/lte_gold.c
+1
-5
openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
+2
-1
openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
+5
-5
No files found.
cmake_targets/CMakeLists.txt
View file @
015187e5
...
...
@@ -134,7 +134,7 @@ else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
set
(
C_FLAGS_PROCESSOR
"
${
C_FLAGS_PROCESSOR
}
-mavx2"
)
endif
()
if
(
CPUINFO MATCHES
"sse4_2"
)
set
(
C_FLAGS_PROCESSOR
"
${
C_FLAGS_PROCESSOR
}
-mavx2 -msse4.2"
)
set
(
C_FLAGS_PROCESSOR
"
${
C_FLAGS_PROCESSOR
}
-mavx2 -msse4.2
-fno-tree-vectorize
"
)
endif
()
if
(
CPUINFO MATCHES
"sse4_1"
)
set
(
C_FLAGS_PROCESSOR
"
${
C_FLAGS_PROCESSOR
}
-msse4.1"
)
...
...
openair1/PHY/CODING/3gpplte_sse.c
View file @
015187e5
...
...
@@ -223,6 +223,7 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
0
b00000001
};
#endif
#ifndef __AVX2__
if
((
n
&
15
)
>
0
)
loop
++
;
...
...
openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
View file @
015187e5
...
...
@@ -186,12 +186,16 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16
__m256i
new0
,
new1
,
new2
,
new3
,
new4
,
new5
,
new6
,
new7
;
__m256i
alpha_max
;
unsigned
long
long
timein
,
timeout
;
l2
=
L
>>
3
;
K1
=
(
frame_length
>>
3
);
#ifdef DEBUG_LOGMAP
fprintf
(
fdavx2
,
"Compute alpha (avx2_16bit)
\n
"
);
fprintf
(
fdavx2b
,
"Compute alpha (avx2_16bit)
\n
"
);
#endif
timein
=
rdtsc_oai
();
for
(
l
=
K1
;;
l
=
l2
,
rerun_flag
=
1
)
{
alpha128
=
(
__m256i
*
)
alpha
;
...
...
@@ -378,6 +382,9 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16
if
(
rerun_flag
==
1
)
break
;
}
timeout
=
rdtsc_oai
();
printf
(
"alpha: inner loop time %llu
\n
"
,
timeout
-
timein
);
}
...
...
@@ -386,9 +393,10 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
int
k
,
rerun_flag
=
0
;
__m256i
m11_128
,
m10_128
;
__m256i
m_b0
,
m_b1
,
m_b2
,
m_b3
,
m_b4
,
m_b5
,
m_b6
,
m_b7
;
__m256i
new0
,
new1
,
new2
,
new3
,
new4
,
new5
,
new6
,
new7
;
__m256i
*
m11p
,
*
m10p
;
register
__m256i
b0
,
b1
,
b2
,
b3
,
b4
,
b5
,
b6
,
b7
;
register
__m256i
m_b0
,
m_b1
,
m_b2
,
m_b3
,
m_b4
,
m_b5
,
m_b6
,
m_b7
;
register
__m256i
new0
,
new1
,
new2
,
new3
,
new4
,
new5
,
new6
,
new7
;
__m256i
*
beta128
,
*
alpha128
,
*
beta_ptr
;
__m256i
beta_max
;
...
...
@@ -398,6 +406,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
llr_t
beta0
,
beta1
;
llr_t
beta0_cw2
,
beta1_cw2
;
unsigned
long
long
timein
,
timeout
;
#ifdef DEBUG_LOGMAP
fprintf
(
fdavx2
,
"compute_beta (avx2_16bit), %p,%p,%p,%p,framelength %d,F %d
\n
"
,
beta
,
m_11
,
m_10
,
alpha
,
frame_length
,
F
);
...
...
@@ -590,56 +600,74 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
#endif
int
loopval
=
((
rerun_flag
==
0
)
?
0
:
((
frame_length
-
L
)
>>
3
));
printf
(
"beta: rerun %d => loopval %d
\n
"
,
rerun_flag
,
loopval
);
timein
=
rdtsc_oai
();
m11p
=
(
frame_length
>>
3
)
-
1
+
(
__m256i
*
)
m_11
;
m10p
=
(
frame_length
>>
3
)
-
1
+
(
__m256i
*
)
m_10
;
for
(
k
=
(
frame_length
>>
3
)
-
1
;
k
>=
loopval
;
k
--
)
{
m11_128
=
((
__m256i
*
)
m_11
)[
k
];
m10_128
=
((
__m256i
*
)
m_10
)[
k
];
m_b0
=
_mm256_adds_epi16
(
beta_ptr
[
4
],
m11_128
);
//m11
m_b1
=
_mm256_subs_epi16
(
beta_ptr
[
4
],
m11_128
);
//m00
m_b2
=
_mm256_subs_epi16
(
beta_ptr
[
5
],
m10_128
);
//m01
m_b3
=
_mm256_adds_epi16
(
beta_ptr
[
5
],
m10_128
);
//m10
m_b4
=
_mm256_adds_epi16
(
beta_ptr
[
6
],
m10_128
);
//m10
m_b5
=
_mm256_subs_epi16
(
beta_ptr
[
6
],
m10_128
);
//m01
m_b6
=
_mm256_subs_epi16
(
beta_ptr
[
7
],
m11_128
);
//m00
m_b7
=
_mm256_adds_epi16
(
beta_ptr
[
7
],
m11_128
);
//m11
new0
=
_mm256_subs_epi16
(
beta_ptr
[
0
],
m11_128
);
//m00
new1
=
_mm256_adds_epi16
(
beta_ptr
[
0
],
m11_128
);
//m11
new2
=
_mm256_adds_epi16
(
beta_ptr
[
1
],
m10_128
);
//m10
new3
=
_mm256_subs_epi16
(
beta_ptr
[
1
],
m10_128
);
//m01
new4
=
_mm256_subs_epi16
(
beta_ptr
[
2
],
m10_128
);
//m01
new5
=
_mm256_adds_epi16
(
beta_ptr
[
2
],
m10_128
);
//m10
new6
=
_mm256_adds_epi16
(
beta_ptr
[
3
],
m11_128
);
//m11
new7
=
_mm256_subs_epi16
(
beta_ptr
[
3
],
m11_128
);
//m00
beta_ptr
-=
8
;
b4
=
_mm256_load_si256
(
&
beta_ptr
[
4
]);
b5
=
_mm256_load_si256
(
&
beta_ptr
[
5
]);
b6
=
_mm256_load_si256
(
&
beta_ptr
[
6
]);
b7
=
_mm256_load_si256
(
&
beta_ptr
[
7
]);
m_b0
=
_mm256_adds_epi16
(
b4
,
*
m11p
);
//m11
m_b1
=
_mm256_subs_epi16
(
b4
,
*
m11p
);
//m00
m_b2
=
_mm256_subs_epi16
(
b5
,
*
m10p
);
//m01
m_b3
=
_mm256_adds_epi16
(
b5
,
*
m10p
);
//m10
m_b4
=
_mm256_adds_epi16
(
b6
,
*
m10p
);
//m10
m_b5
=
_mm256_subs_epi16
(
b6
,
*
m10p
);
//m01
m_b6
=
_mm256_subs_epi16
(
b7
,
*
m11p
);
//m00
m_b7
=
_mm256_adds_epi16
(
b7
,
*
m11p
);
//m11
b0
=
_mm256_load_si256
(
&
beta_ptr
[
0
]);
b1
=
_mm256_load_si256
(
&
beta_ptr
[
1
]);
b2
=
_mm256_load_si256
(
&
beta_ptr
[
2
]);
b3
=
_mm256_load_si256
(
&
beta_ptr
[
3
]);
new0
=
_mm256_subs_epi16
(
b0
,
*
m11p
);
//m00
new1
=
_mm256_adds_epi16
(
b0
,
*
m11p
);
//m11
new2
=
_mm256_adds_epi16
(
b1
,
*
m10p
);
//m10
new3
=
_mm256_subs_epi16
(
b1
,
*
m10p
);
//m01
new4
=
_mm256_subs_epi16
(
b2
,
*
m10p
);
//m01
new5
=
_mm256_adds_epi16
(
b2
,
*
m10p
);
//m10
new6
=
_mm256_adds_epi16
(
b3
,
*
m11p
);
//m11
new7
=
_mm256_subs_epi16
(
b3
,
*
m11p
);
//m00
b0
=
_mm256_max_epi16
(
m_b0
,
new0
);
b1
=
_mm256_max_epi16
(
m_b1
,
new1
);
b2
=
_mm256_max_epi16
(
m_b2
,
new2
);
b3
=
_mm256_max_epi16
(
m_b3
,
new3
);
b4
=
_mm256_max_epi16
(
m_b4
,
new4
);
b5
=
_mm256_max_epi16
(
m_b5
,
new5
);
b6
=
_mm256_max_epi16
(
m_b6
,
new6
);
b7
=
_mm256_max_epi16
(
m_b7
,
new7
);
beta_max
=
_mm256_max_epi16
(
b0
,
b1
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b2
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b3
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b4
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b5
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b6
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b7
);
beta_ptr
[
0
]
=
_mm256_max_epi16
(
m_b0
,
new0
);
beta_ptr
[
1
]
=
_mm256_max_epi16
(
m_b1
,
new1
);
beta_ptr
[
2
]
=
_mm256_max_epi16
(
m_b2
,
new2
);
beta_ptr
[
3
]
=
_mm256_max_epi16
(
m_b3
,
new3
);
beta_ptr
[
4
]
=
_mm256_max_epi16
(
m_b4
,
new4
);
beta_ptr
[
5
]
=
_mm256_max_epi16
(
m_b5
,
new5
);
beta_ptr
[
6
]
=
_mm256_max_epi16
(
m_b6
,
new6
);
beta_ptr
[
7
]
=
_mm256_max_epi16
(
m_b7
,
new7
);
beta_max
=
_mm256_max_epi16
(
beta_ptr
[
0
],
beta_ptr
[
1
]);
beta_max
=
_mm256_max_epi16
(
beta_max
,
beta_ptr
[
2
]);
beta_max
=
_mm256_max_epi16
(
beta_max
,
beta_ptr
[
3
]);
beta_max
=
_mm256_max_epi16
(
beta_max
,
beta_ptr
[
4
]);
beta_max
=
_mm256_max_epi16
(
beta_max
,
beta_ptr
[
5
]);
beta_max
=
_mm256_max_epi16
(
beta_max
,
beta_ptr
[
6
]);
beta_max
=
_mm256_max_epi16
(
beta_max
,
beta_ptr
[
7
]);
beta_ptr
[
0
]
=
_mm256_subs_epi16
(
beta_ptr
[
0
],
beta_max
);
beta_ptr
[
1
]
=
_mm256_subs_epi16
(
beta_ptr
[
1
],
beta_max
);
beta_ptr
[
2
]
=
_mm256_subs_epi16
(
beta_ptr
[
2
],
beta_max
);
beta_ptr
[
3
]
=
_mm256_subs_epi16
(
beta_ptr
[
3
],
beta_max
);
beta_ptr
[
4
]
=
_mm256_subs_epi16
(
beta_ptr
[
4
],
beta_max
);
beta_ptr
[
5
]
=
_mm256_subs_epi16
(
beta_ptr
[
5
],
beta_max
);
beta_ptr
[
6
]
=
_mm256_subs_epi16
(
beta_ptr
[
6
],
beta_max
);
beta_ptr
[
7
]
=
_mm256_subs_epi16
(
beta_ptr
[
7
],
beta_max
);
beta_ptr
-=
8
;
m11p
--
;
m10p
--
;
beta_ptr
[
0
]
=
_mm256_subs_epi16
(
b0
,
beta_max
);
beta_ptr
[
1
]
=
_mm256_subs_epi16
(
b1
,
beta_max
);
beta_ptr
[
2
]
=
_mm256_subs_epi16
(
b2
,
beta_max
);
beta_ptr
[
3
]
=
_mm256_subs_epi16
(
b3
,
beta_max
);
beta_ptr
[
4
]
=
_mm256_subs_epi16
(
b4
,
beta_max
);
beta_ptr
[
5
]
=
_mm256_subs_epi16
(
b5
,
beta_max
);
beta_ptr
[
6
]
=
_mm256_subs_epi16
(
b6
,
beta_max
);
beta_ptr
[
7
]
=
_mm256_subs_epi16
(
b7
,
beta_max
);
#ifdef DEBUG_LOGMAP
fprintf
(
fdavx2
,
"Loop index %d, mb
\n
"
,
k
);
...
...
@@ -658,6 +686,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
#endif
}
timeout
=
rdtsc_oai
();
printf
(
"beta: inner loop time %llu
\n
"
,
timeout
-
timein
);
if
(
rerun_flag
==
1
)
break
;
...
...
@@ -968,7 +998,7 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y,
yp2
=
yparity2
;
#if 0
for (i=0; i<n; i+=8) {
pi2_p = &pi2tab16avx2[iind][i];
...
...
@@ -1084,9 +1114,23 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y,
yp128_cw2+=3;
}
yp=(llr_t*)yp128;
yp_cw2=(llr_t*)yp128_cw2;
#else
pi2_p
=
&
pi2tab16avx2
[
iind
][
0
];
for
(
i
=
0
,
j
=
0
;
i
<
n
;
i
++
)
{
s
[
*
pi2_p
]
=
y
[
j
];
s
[
*
pi2_p
+
8
]
=
y2
[
j
++
];
yp1
[
*
pi2_p
]
=
y
[
j
];
yp1
[
*
pi2_p
+
8
]
=
y2
[
j
++
];
yp2
[
*
pi2_p
]
=
y
[
j
];
yp2
[(
*
pi2_p
++
)
+
8
]
=
y2
[
j
++
];
}
yp
=
(
llr_t
*
)
&
y
[
j
];
yp_cw2
=
(
llr_t
*
)
&
y2
[
j
];
#endif
// Termination
for
(
i
=
0
;
i
<
3
;
i
++
)
{
...
...
openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
View file @
015187e5
...
...
@@ -144,12 +144,25 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
fprintf
(
fdsse4
,
"compute_gamma (sse_16bit), %p,%p,%p,%p,framelength %d
\n
"
,
m11
,
m10
,
systematic
,
y_parity
,
frame_length
);
#endif
#ifndef __AVX2__
K1
=
frame_length
>>
3
;
#else
if
((
frame_length
&
15
)
>
0
)
K1
=
(
frame_length
+
1
)
>>
4
;
else
K1
=
frame_length
>>
4
;
#endif
for
(
k
=
0
;
k
<
K1
;
k
++
)
{
#if defined(__x86_64__) || defined(__i386__)
#ifndef __AVX2__
m11_128
[
k
]
=
_mm_srai_epi16
(
_mm_adds_epi16
(
systematic128
[
k
],
y_parity128
[
k
]),
1
);
m10_128
[
k
]
=
_mm_srai_epi16
(
_mm_subs_epi16
(
systematic128
[
k
],
y_parity128
[
k
]),
1
);
#else
((
__m256i
*
)
m11_128
)[
k
]
=
_mm256_srai_epi16
(
_mm256_adds_epi16
(((
__m256i
*
)
systematic128
)[
k
],((
__m256i
*
)
y_parity128
)[
k
]),
1
);
// ((__m256i*)m10_128)[k] = _mm256_srai_epi16(_mm256_subs_epi16(((__m256i*)y_parity128)[k],((__m256i*)systematic128)[k]),1);
((
__m256i
*
)
m10_128
)[
k
]
=
_mm256_srai_epi16
(
_mm256_subs_epi16
(((
__m256i
*
)
systematic128
)[
k
],((
__m256i
*
)
y_parity128
)[
k
]),
1
);
#endif
#elif defined(__arm__)
m11_128
[
k
]
=
vhaddq_s16
(
systematic128
[
k
],
y_parity128
[
k
]);
m10_128
[
k
]
=
vhsubq_s16
(
systematic128
[
k
],
y_parity128
[
k
]);
...
...
@@ -164,13 +177,19 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
#endif
}
k
=
frame_length
>>
3
;
// Termination
#if defined(__x86_64__) || defined(__i386__)
m11_128
[
k
]
=
_mm_srai_epi16
(
_mm_adds_epi16
(
systematic128
[
k
+
term_flag
],
y_parity128
[
k
]),
1
);
//#ifndef __AVX2__
#if 1
m10_128
[
k
]
=
_mm_srai_epi16
(
_mm_subs_epi16
(
systematic128
[
k
+
term_flag
],
y_parity128
[
k
]),
1
);
#else
m10_128
[
k
]
=
_mm_srai_epi16
(
_mm_subs_epi16
(
y_parity128
[
k
],
systematic128
[
k
+
term_flag
]),
1
);
#endif
#elif defined(__arm__)
m11_128
[
k
]
=
vhaddq_s16
(
systematic128
[
k
+
term_flag
],
y_parity128
[
k
]);
m10_128
[
k
]
=
vhsubq_s16
(
systematic128
[
k
+
term_flag
],
y_parity128
[
k
]);
m10_128
[
k
]
=
vhsubq_s16
(
systematic128
[
k
+
term_flag
],
y_parity128
[
k
]);
#endif
#ifdef DEBUG_LOGMAP
...
...
@@ -188,11 +207,21 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
{
int
k
,
l
,
l2
,
K1
,
rerun_flag
=
0
;
#if defined(__x86_64__) || defined(__i386__)
__m128i
*
alpha128
=
(
__m128i
*
)
alpha
,
*
alpha_ptr
;
__m128i
a0
,
a1
,
a2
,
a3
,
a4
,
a5
,
a6
,
a7
,
*
m11p
,
*
m10p
;
__m128i
*
alpha128
=
(
__m128i
*
)
alpha
,
*
alpha_ptr
,
*
m11p
,
*
m10p
;
//#ifndef __AVX2__
#if 1
__m128i
a0
,
a1
,
a2
,
a3
,
a4
,
a5
,
a6
,
a7
;
__m128i
m_b0
,
m_b1
,
m_b2
,
m_b3
,
m_b4
,
m_b5
,
m_b6
,
m_b7
;
__m128i
new0
,
new1
,
new2
,
new3
,
new4
,
new5
,
new6
,
new7
;
__m128i
alpha_max
;
#else
__m256i
*
alpha256
=
(
__m256i
*
)
alpha
,
*
alpha_ptr256
,
m11
,
m10
;
__m256i
a01
,
a23
,
a45
,
a67
,
a02
,
a13
,
a64
,
a75
;
__m256i
m_b01
,
m_b23
,
m_b45
,
m_b67
,
new01
,
new23
,
new45
,
new67
;
__m256i
m11m10_256
;
__m256i
alpha_max
;
#endif
#elif defined(__arm__)
int16x8_t
*
alpha128
=
(
int16x8_t
*
)
alpha
,
*
alpha_ptr
;
int16x8_t
a0
,
a1
,
a2
,
a3
,
a4
,
a5
,
a6
,
a7
,
*
m11p
,
*
m10p
;
...
...
@@ -208,6 +237,10 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
for
(
l
=
K1
;;
l
=
l2
,
rerun_flag
=
1
)
{
#if defined(__x86_64__) || defined(__i386__)
alpha128
=
(
__m128i
*
)
alpha
;
//#ifdef __AVX2__
#if 0
alpha256 = (__m256i *)alpha;
#endif
#elif defined(__arm__)
alpha128
=
(
int16x8_t
*
)
alpha
;
#endif
...
...
@@ -288,6 +321,11 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
}
alpha_ptr
=
&
alpha128
[
0
];
//#ifdef __AVX2__
#if 0
alpha_ptr256 = &alpha256[0];
#endif
#if defined(__x86_64__) || defined(__i386__)
m11p
=
(
__m128i
*
)
m_11
;
m10p
=
(
__m128i
*
)
m_10
;
...
...
@@ -300,6 +338,8 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
k
++
)
{
#if defined(__x86_64__) || defined(__i386__)
//#ifndef __AVX2__
#if 1
a1
=
_mm_load_si128
(
&
alpha_ptr
[
1
]);
a3
=
_mm_load_si128
(
&
alpha_ptr
[
3
]);
a5
=
_mm_load_si128
(
&
alpha_ptr
[
5
]);
...
...
@@ -344,6 +384,37 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha_max
=
_mm_max_epi16
(
alpha_max
,
a5
);
alpha_max
=
_mm_max_epi16
(
alpha_max
,
a6
);
alpha_max
=
_mm_max_epi16
(
alpha_max
,
a7
);
#else
a02
=
_mm256_load_si256
(
&
alpha_ptr256
[
0
]);
a13
=
_mm256_load_si256
(
&
alpha_ptr256
[
1
]);
a64
=
_mm256_load_si256
(
&
alpha_ptr256
[
2
]);
a75
=
_mm256_load_si256
(
&
alpha_ptr256
[
3
]);
m11m10_256
=
_mm256_insertf128_si256
(
m11m10_256
,
*
m11p
,
0
);
m11m10_256
=
_mm256_insertf128_si256
(
m11m10_256
,
*
m10p
,
1
);
m_b01
=
_mm256_adds_epi16
(
a13
,
m11m10_256
);
//negative m10
m_b23
=
_mm256_subs_epi16
(
a75
,
m11m10_256
);
//negative m10
m_b45
=
_mm256_subs_epi16
(
a13
,
m11m10_256
);
//negative m10
m_b67
=
_mm256_adds_epi16
(
a75
,
m11m10_256
);
//negative m10
new01
=
_mm256_subs_epi16
(
a02
,
m11m10_256
);
//negative m10
new23
=
_mm256_adds_epi16
(
a64
,
m11m10_256
);
//negative m10
new45
=
_mm256_adds_epi16
(
a02
,
m11m10_256
);
//negative m10
new67
=
_mm256_subs_epi16
(
a64
,
m11m10_256
);
//negative m10
a01
=
_mm256_max_epi16
(
m_b01
,
new01
);
a23
=
_mm256_max_epi16
(
m_b23
,
new23
);
a45
=
_mm256_max_epi16
(
m_b45
,
new45
);
a67
=
_mm256_max_epi16
(
m_b67
,
new67
);
alpha_max
=
_mm256_max_epi16
(
a01
,
a23
);
alpha_max
=
_mm256_max_epi16
(
alpha_max
,
a45
);
alpha_max
=
_mm256_max_epi16
(
alpha_max
,
a67
);
alpha_max
=
_mm256_max_epi16
(
alpha_max
,
_mm256_permutevar8x32_epi32
(
alpha_max
,
_mm256_set_epi32
(
3
,
2
,
1
,
0
,
7
,
6
,
5
,
4
)));
#endif
#elif defined(__arm__)
m_b0
=
vqaddq_s16
(
alpha_ptr
[
1
],
*
m11p
);
// m11
m_b4
=
vqsubq_s16
(
alpha_ptr
[
1
],
*
m11p
);
// m00=-m11
...
...
@@ -383,9 +454,15 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
#endif
alpha_ptr
+=
8
;
//#ifdef __AVX2__
#if 0
alpha_ptr256+=4;
#endif
m11p
++
;
m10p
++
;
#if defined(__x86_64__) || defined(__i386__)
//#ifndef __AVX2__
#if 1
alpha_ptr
[
0
]
=
_mm_subs_epi16
(
a0
,
alpha_max
);
alpha_ptr
[
1
]
=
_mm_subs_epi16
(
a1
,
alpha_max
);
alpha_ptr
[
2
]
=
_mm_subs_epi16
(
a2
,
alpha_max
);
...
...
@@ -394,6 +471,18 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
alpha_ptr
[
5
]
=
_mm_subs_epi16
(
a5
,
alpha_max
);
alpha_ptr
[
6
]
=
_mm_subs_epi16
(
a6
,
alpha_max
);
alpha_ptr
[
7
]
=
_mm_subs_epi16
(
a7
,
alpha_max
);
#else
a01
=
_mm256_subs_epi16
(
a01
,
alpha_max
);
a23
=
_mm256_subs_epi16
(
a23
,
alpha_max
);
a45
=
_mm256_subs_epi16
(
a45
,
alpha_max
);
a67
=
_mm256_subs_epi16
(
a67
,
alpha_max
);
alpha_ptr256
[
0
]
=
_mm256_permute2x128_si256
(
a01
,
a23
,
0x20
);
//a02
alpha_ptr256
[
1
]
=
_mm256_permute2x128_si256
(
a01
,
a23
,
0x13
);
//a13
alpha_ptr256
[
2
]
=
_mm256_permute2x128_si256
(
a45
,
a67
,
0x02
);
//a64
alpha_ptr256
[
3
]
=
_mm256_permute2x128_si256
(
a45
,
a67
,
0x31
);
//a75
#endif
#elif defined(__arm__)
alpha_ptr
[
0
]
=
vqsubq_s16
(
a0
,
alpha_max
);
alpha_ptr
[
1
]
=
vqsubq_s16
(
a1
,
alpha_max
);
...
...
@@ -488,8 +577,12 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
// fprintf(fdsse4,"beta init: offset8 %d\n",offset8_flag);
m11
=
(
int16_t
)
m_11
[
2
+
frame_length
];
//#ifndef __AVX2__
#if 1
m10
=
(
int16_t
)
m_10
[
2
+
frame_length
];
#else
m10
=-
(
int16_t
)
m_10
[
2
+
frame_length
];
#endif
#ifdef DEBUG_LOGMAP
fprintf
(
fdsse4
,
"m11,m10 %d,%d
\n
"
,
m11
,
m10
);
#endif
...
...
@@ -643,6 +736,9 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
m11_128
=
((
__m128i
*
)
m_11
)[
k
];
m10_128
=
((
__m128i
*
)
m_10
)[
k
];
//#ifndef __AVX2__
#if 1
m_b0
=
_mm_adds_epi16
(
beta_ptr
[
4
],
m11_128
);
//m11
m_b1
=
_mm_subs_epi16
(
beta_ptr
[
4
],
m11_128
);
//m00
m_b2
=
_mm_subs_epi16
(
beta_ptr
[
5
],
m10_128
);
//m01
...
...
@@ -652,6 +748,7 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
m_b6
=
_mm_subs_epi16
(
beta_ptr
[
7
],
m11_128
);
//m00
m_b7
=
_mm_adds_epi16
(
beta_ptr
[
7
],
m11_128
);
//m11
new0
=
_mm_subs_epi16
(
beta_ptr
[
0
],
m11_128
);
//m00
new1
=
_mm_adds_epi16
(
beta_ptr
[
0
],
m11_128
);
//m11
new2
=
_mm_adds_epi16
(
beta_ptr
[
1
],
m10_128
);
//m10
...
...
@@ -661,8 +758,29 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
new6
=
_mm_adds_epi16
(
beta_ptr
[
3
],
m11_128
);
//m11
new7
=
_mm_subs_epi16
(
beta_ptr
[
3
],
m11_128
);
//m00
#else
b01
=
_mm256_load_si256
(
&
((
_m256i
*
)
beta_ptr
)[
0
]);
b23
=
_mm256_load_si256
(
&
((
_m256i
*
)
beta_ptr
)[
1
]);
b45
=
_mm256_load_si256
(
&
((
_m256i
*
)
beta_ptr
)[
2
]);
b67
=
_mm256_load_si256
(
&
((
_m256i
*
)
beta_ptr
)[
3
]);
m11m10_256
=
_mm256_insertf128_si256
(
m11m10_256
,
m11_128
,
0
);
m11m10_256
=
_mm256_insertf128_si256
(
m11m10_256
,
m10_128
,
1
);
m_b02
=
_mm256_adds_epi16
(
b45
,
m11m10_256
);
//negative m10
m_b13
=
_mm256_subs_epi16
(
b45
,
m11m10_256
);
//negative m10
m_b64
=
_mm256_subs_epi16
(
b67
,
m11m10_256
);
//negative m10
m_b75
=
_mm256_adds_epi16
(
b67
,
m11m10_256
);
//negative m10
new02
=
_mm256_subs_epi16
(
b01
,
m11m10_256
);
//negative m10
new13
=
_mm256_adds_epi16
(
b01
,
m11m10_256
);
//negative m10
new64
=
_mm256_adds_epi16
(
b23
,
m11m10_256
);
//negative m10
new75
=
_mm256_subs_epi16
(
b24
,
m11m10_256
);
//negative m10
#endif
beta_ptr
-=
8
;
//#ifndef __AVX2__
#if 1
beta_ptr
[
0
]
=
_mm_max_epi16
(
m_b0
,
new0
);
beta_ptr
[
1
]
=
_mm_max_epi16
(
m_b1
,
new1
);
beta_ptr
[
2
]
=
_mm_max_epi16
(
m_b2
,
new2
);
...
...
@@ -688,6 +806,28 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
beta_ptr
[
5
]
=
_mm_subs_epi16
(
beta_ptr
[
5
],
beta_max
);
beta_ptr
[
6
]
=
_mm_subs_epi16
(
beta_ptr
[
6
],
beta_max
);
beta_ptr
[
7
]
=
_mm_subs_epi16
(
beta_ptr
[
7
],
beta_max
);
#else
b02
=
_mm256_max_epi16
(
m_b02
,
new02
);
b13
=
_mm256_max_epi16
(
m_b13
,
new13
);
b64
=
_mm256_max_epi16
(
m_b64
,
new64
);
b75
=
_mm256_max_epi16
(
m_b75
,
new75
);
beta_max
=
_mm256_max_epi16
(
b02
,
b13
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b64
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
b75
);
beta_max
=
_mm256_max_epi16
(
beta_max
,
_mm256_permutevar8x32_epi32
(
betaa_max
,
_mm256_set_epi32
(
3
,
2
,
1
,
0
,
7
,
6
,
5
,
4
)));
b02
=
_mm256_subs_epi16
(
b02
,
beta_max
);
b13
=
_mm256_subs_epi16
(
b13
,
beta_max
);
b64
=
_mm256_subs_epi16
(
b64
,
beta_max
);
b75
=
_mm256_subs_epi16
(
b75
,
beta_max
);
((
_m256i
*
)
beta_ptr
)[
0
])
=
_mm256_permute2x128_si256
(
b02
,
b13
,
0x02
);
//b01
((
_m256i
*
)
beta_ptr
)[
1
])
=
_mm256_permute2x128_si256
(
b02
,
b13
,
0x31
);
//b23
((
_m256i
*
)
beta_ptr
)[
2
])
=
_mm256_permute2x128_si256
(
b64
,
b75
,
0x13
);
//b45
((
_m256i
*
)
beta_ptr
)[
3
])
=
_mm256_permute2x128_si256
(
b64
,
b75
,
0x20
);
//b67
#endif
#elif defined(__arm__)
m11_128
=
((
int16x8_t
*
)
m_11
)[
k
];
m10_128
=
((
int16x8_t
*
)
m_10
)[
k
];
...
...
@@ -820,6 +960,9 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
print_shorts("b6:",&beta_ptr[6]);
print_shorts("b7:",&beta_ptr[7]);
*/
//#ifndef __AVX2__
#if 1
m00_4
=
_mm_adds_epi16
(
alpha_ptr
[
7
],
beta_ptr
[
3
]);
//ALPHA_BETA_4m00;
m11_4
=
_mm_adds_epi16
(
alpha_ptr
[
7
],
beta_ptr
[
7
]);
//ALPHA_BETA_4m11;
m00_3
=
_mm_adds_epi16
(
alpha_ptr
[
6
],
beta_ptr
[
7
]);
//ALPHA_BETA_3m00;
...
...
@@ -836,6 +979,32 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
m10_2
=
_mm_adds_epi16
(
alpha_ptr
[
3
],
beta_ptr
[
5
]);
//ALPHA_BETA_2m10;
m10_1
=
_mm_adds_epi16
(
alpha_ptr
[
2
],
beta_ptr
[
1
]);
//ALPHA_BETA_1m10;
m01_1
=
_mm_adds_epi16
(
alpha_ptr
[
2
],
beta_ptr
[
5
]);
//ALPHA_BETA_1m01;
#else
m00_1
=
_mm_adds_epi16
(
alpha_ptr
[
0
],
beta_ptr
[
0
]);
//ALPHA_BETA_1m00;
m10_1
=
_mm_adds_epi16
(
alpha_ptr
[
2
],
beta_ptr
[
1
]);
//ALPHA_BETA_1m10;
m11_1
=
_mm_adds_epi16
(
alpha_ptr
[
0
],
beta_ptr
[
4
]);
//ALPHA_BETA_1m11;
m01_1
=
_mm_adds_epi16
(
alpha_ptr
[
2
],
beta_ptr
[
5
]);
//ALPHA_BETA_1m01;
m11_2
=
_mm_adds_epi16
(
alpha_ptr
[
1
],
beta_ptr
[
0
]);
//ALPHA_BETA_2m11;
m01_2
=
_mm_adds_epi16
(
alpha_ptr
[
3
],
beta_ptr
[
1
]);
//ALPHA_BETA_2m01;
m00_2
=
_mm_adds_epi16
(
alpha_ptr
[
1
],
beta_ptr
[
4
]);
//ALPHA_BETA_2m00;
m10_2
=
_mm_adds_epi16
(
alpha_ptr
[
3
],
beta_ptr
[
5
]);
//ALPHA_BETA_2m10;
m11_3
=
_mm_adds_epi16
(
alpha_ptr
[
6
],
beta_ptr
[
3
]);
//ALPHA_BETA_3m11;
m01_3
=
_mm_adds_epi16
(
alpha_ptr
[
4
],
beta_ptr
[
2
]);
//ALPHA_BETA_3m01;
m00_3
=
_mm_adds_epi16
(
alpha_ptr
[
6
],
beta_ptr
[
7
]);
//ALPHA_BETA_3m00;
m10_3
=
_mm_adds_epi16
(
alpha_ptr
[
4
],
beta_ptr
[
6
]);
//ALPHA_BETA_3m10;
m00_4
=
_mm_adds_epi16
(
alpha_ptr
[
7
],
beta_ptr
[
3
]);
//ALPHA_BETA_4m00;
m10_4
=
_mm_adds_epi16
(
alpha_ptr
[
5
],
beta_ptr
[
2
]);
//ALPHA_BETA_4m10;
m11_4
=
_mm_adds_epi16
(
alpha_ptr
[
7
],
beta_ptr
[
7
]);
//ALPHA_BETA_4m11;
m01_4
=
_mm_adds_epi16
(
alpha_ptr
[
5
],
beta_ptr
[
6
]);
//ALPHA_BETA_4m01;
#endif
/*
print_shorts("m11_1:",&m11_1);
print_shorts("m11_2:",&m11_2);
...
...
@@ -1030,19 +1199,19 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
n is the size in bits of the coded block, with the tail */
llr_t
systematic0
[
n
+
16
]
__attribute__
((
aligned
(
16
)));
llr_t
systematic1
[
n
+
16
]
__attribute__
((
aligned
(
16
)));
llr_t
systematic2
[
n
+
16
]
__attribute__
((
aligned
(
16
)));
llr_t
yparity1
[
n
+
16
]
__attribute__
((
aligned
(
16
)));
llr_t
yparity2
[
n
+
16
]
__attribute__
((
aligned
(
16
)));
llr_t
systematic0
[
n
+
16
]
__attribute__
((
aligned
(
32
)));
llr_t
systematic1
[
n
+
16
]
__attribute__
((
aligned
(
32
)));
llr_t
systematic2
[
n
+
16
]
__attribute__
((
aligned
(
32
)));
llr_t
yparity1
[
n
+
16
]
__attribute__
((
aligned
(
32
)));
llr_t
yparity2
[
n
+
16
]
__attribute__
((
aligned
(
32
)));
llr_t
ext
[
n
+
128
]
__attribute__
((
aligned
(
16
)));
llr_t
ext2
[
n
+
128
]
__attribute__
((
aligned
(
16
)));
llr_t
ext
[
n
+
128
]
__attribute__
((
aligned
(
32
)));
llr_t
ext2
[
n
+
128
]
__attribute__
((
aligned
(
32
)));
llr_t
alpha
[(
n
+
16
)
*
8
]
__attribute__
((
aligned
(
16
)));
llr_t
beta
[(
n
+
16
)
*
8
]
__attribute__
((
aligned
(
16
)));
llr_t
m11
[
n
+
16
]
__attribute__
((
aligned
(
16
)));
llr_t
m10
[
n
+
16
]
__attribute__
((
aligned
(
16
)));
llr_t
alpha
[(
n
+
16
)
*
8
]
__attribute__
((
aligned
(
32
)));
llr_t
beta
[(
n
+
16
)
*
8
]
__attribute__
((
aligned
(
32
)));
llr_t
m11
[
n
+
32
]
__attribute__
((
aligned
(
32
)));
llr_t
m10
[
n
+
32
]
__attribute__
((
aligned
(
32
)));
int
*
pi2_p
,
*
pi4_p
,
*
pi5_p
,
*
pi6_p
;
...
...
openair1/PHY/LTE_REFSIG/lte_gold.c
View file @
015187e5
...
...
@@ -64,18 +64,15 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14]
(((
1
+
(
Nid_cell
<<
1
))
*
(
1
+
(((
frame_parms
->
Ncp
==
0
)
?
4
:
3
)
*
l
)
+
(
7
*
(
1
+
ns
))))
<<
10
);
//cinit
//x2 = frame_parms->Ncp + (Nid_cell<<1) + (1+(Nid_cell<<1))*(1 + (3*l) + (7*(1+ns))); //cinit
//n = 0
// printf("cinit (ns %d, l %d) => %d\n",ns,l,x2);
x1
=
1
+
(
1
<<
31
);
x2
=
x2
^
((
x2
^
(
x2
>>
1
)
^
(
x2
>>
2
)
^
(
x2
>>
3
))
<<
31
);
// skip first 50 double words (1600 bits)
//printf("n=0 : x1 %x, x2 %x\n",x1,x2);
for
(
n
=
1
;
n
<
50
;
n
++
)
{
x1
=
(
x1
>>
1
)
^
(
x1
>>
4
);
x1
=
x1
^
(
x1
<<
31
)
^
(
x1
<<
28
);
x2
=
(
x2
>>
1
)
^
(
x2
>>
2
)
^
(
x2
>>
3
)
^
(
x2
>>
4
);
x2
=
x2
^
(
x2
<<
31
)
^
(
x2
<<
30
)
^
(
x2
<<
29
)
^
(
x2
<<
28
);
// printf("x1 : %x, x2 : %x\n",x1,x2);
}
for
(
n
=
0
;
n
<
14
;
n
++
)
{
...
...
@@ -84,7 +81,6 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14]
x2
=
(
x2
>>
1
)
^
(
x2
>>
2
)
^
(
x2
>>
3
)
^
(
x2
>>
4
);
x2
=
x2
^
(
x2
<<
31
)
^
(
x2
<<
30
)
^
(
x2
<<
29
)
^
(
x2
<<
28
);
lte_gold_table
[
ns
][
l
][
n
]
=
x1
^
x2
;
// printf("n=%d : c %x\n",n,x1^x2);
}
}
...
...
openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
View file @
015187e5
...
...
@@ -446,7 +446,8 @@ uint32_t dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
printf("\n");
*/
#ifndef __AVX2__
//#ifndef __AVX2__
#if 1
if
(
err_flag
==
0
)
{
start_meas
(
dlsch_turbo_decoding_stats
);
...
...
openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
View file @
015187e5
...
...
@@ -1898,17 +1898,17 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms,
for
(
aarx
=
0
;
aarx
<
frame_parms
->
nb_antennas_rx
;
aarx
++
)
{
dl_ch0_128
=
(
__m128i
*
)
&
dl_ch_estimates_ext
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
dl_ch1_128
=
(
__m128i
*
)
&
dl_ch_estimates_ext
[
2
+
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
dl_ch0_128
=
(
__m128i
*
)
&
dl_ch_estimates_ext
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
// hr,0
dl_ch1_128
=
(
__m128i
*
)
&
dl_ch_estimates_ext
[
2
+
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
// hr,1
dl_ch_mag0_128
=
(
__m128i
*
)
&
dl_ch_mag0
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
dl_ch_mag0_128b
=
(
__m128i
*
)
&
dl_ch_magb0
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
dl_ch_mag1_128
=
(
__m128i
*
)
&
dl_ch_mag1
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
dl_ch_mag1_128b
=
(
__m128i
*
)
&
dl_ch_magb1
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
rxdataF128
=
(
__m128i
*
)
&
rxdataF_ext
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
rxdataF_comp0_128
=
(
__m128i
*
)
&
rxdataF_comp0
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
rxdataF_comp1_128
=
(
__m128i
*
)
&
rxdataF_comp1
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
rxdataF128
=
(
__m128i
*
)
&
rxdataF_ext
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
// yr
rxdataF_comp0_128
=
(
__m128i
*
)
&
rxdataF_comp0
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
// yr,0 = yr * conj(hr,0)
rxdataF_comp1_128
=
(
__m128i
*
)
&
rxdataF_comp1
[
aarx
][
symbol
*
frame_parms
->
N_RB_DL
*
12
];
// yr,1 = yr * conj(hr,1)
for
(
rb
=
0
;
rb
<
nb_rb
;
rb
++
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment