Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
O
OpenXG-RAN
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
canghaiwuhen
OpenXG-RAN
Commits
76ebc5ad
Commit
76ebc5ad
authored
6 years ago
by
Raymond Knopp
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
addition of ARM NEON intrinsics
parent
88d5bf42
develop
1
128-ues
256_QAM_demod
375-syrtem-sdr-platform
408-reworked
408-ue-main-threads
445-LDPC-implementation-on-GPU
459-pusch-based-ta-updates
464-ru_beamforming_in_gpu
464-ru_beamforming_in_gpu-CPUsubfunction
472-add-pusch-dmrs-modes
481-ldpc-decoder-on-gpu
5g_fapi_scf
LTE_TRX_on_single_port
NCTU_CS_ISIP
NCTU_CS_ISIP_CPU
NCTU_CS_ISIP_GPU
NCTU_OpinConnect_LDPC
NR-PHY-MAC-IF-multi-UE
NR_10MHz
NR_CSI_reporting
NR_DCI_01
NR_DL_sched_fixes
NR_DL_scheduler
NR_FAPI_beamindex_SSB_RO
NR_FR2_RA
NR_FR2_RRC_SSB
NR_MAC_CE_GlobalEdge
NR_MAC_Multi_Rach_GlobalEdge
NR_MAC_SSB_RO_GlobalEdge
NR_MAC_SSB_RO_UE_IDCC
NR_MAC_SSB_RO_merge
NR_MAC_TCI_UCI_GlobalEdge
NR_NGAP
NR_PDCP_noS1
NR_PUCCH_MultiUE
NR_RA_updates
NR_RRCConfiguragion_FR2
NR_RRCConfiguration
NR_RRCConfiguration_FR2
NR_RRCConfiguration_S1U
NR_RRCConfiguration_merge_develop
NR_RRCConfiguration_sync_source
NR_RRCConfiguration_trx_thread
NR_RRC_CP_bugfix
NR_RRC_PDCP
NR_RRC_PRACH_procedures
NR_RRC_PRACH_procedures_todevelop
NR_RRC_PUSCH
NR_RRC_TA
NR_RRC_X2AP_AMBR_Change_Global_edge
NR_RRC_X2AP_RemoveHardcodings_GlobalEdge
NR_RRC_config_simplified
NR_RRC_harq
NR_RRC_harq_b
NR_RRC_harq_hacks
NR_RRC_harq_newdcipdu
NR_SA_NGAP_RRC
NR_SA_NGAP_RRC_wk42
NR_SA_itti_sim_wk48
NR_SCHED
NR_SRB_Config
NR_TRX_on_single_port
NR_TRX_on_single_port2
NR_UE_MAC_scheduler
NR_UE_RA_fixes
NR_UE_UL_DCI_improvements
NR_UE_enable_parallelization
NR_UE_stability_fixes
NR_UL_FAPI_programming
NR_UL_scheduler
NR_UL_scheduler_rebased
NR_UL_scheduling
NR_beamforming_test
NR_gNB_SCF_Indication
NR_ipaccess_testing
NR_mac_uci_functions_rework
NR_msg2_phytest
NR_scheduling_CSIRS
NR_scheduling_request
NR_test_S1U_RRC_PRACH_procedures
NR_ue_dlsch_dmrs_cdm
OpInConnect_ISIP
PUSCH_TA_update
RA_CI_test
UE_DL_DCI_hotfix
addoptions_nr_USRPdevice
bch-fixes-bitmap
benetel_5g_prach_fix
benetel_phase_rotation
benetel_phase_rotation_old
bugfix-minor-remove-wrong-log
bugfix-nr-bands
bugfix-nr-ldpc-post-processing
bugfix-nr-ldpc-size-typo
bugfix-nr-pdcp-sn-size
bugfix-nr-rate-matching-assertion
cce_indexing_fix
cce_indexing_fix2
ci-deploy-docker-compose
ci-rd-july-improvements
ci-ul-iperf-from-trf-container
clean-5G-scope-round2
cleanup_softmodem_main
constant_power
debug_branch_init_sync
develop-ci
develop-nr
develop-nr-adding-2018-09-asn1
develop-nr-fr2
develop-nr-fr2-rework
develop-nr_cppcheck
develop-oriecpriupdates
develop-sib1
develop_inria_ci_deployment
develop_inria_ci_deployment_gp
develop_integration_2020_w15
develop_integration_2020_w19
develop_integration_w08
dfts_alternatives
dlsch-all-dlslots
dlsch_encode_mthread
dlsch_parallel
docupdate_tools
dongzhanyi-zte-develop
dongzhanyi-zte-develop2
dreibh/apt-auth-fix
dreibh/device-load-fix
dreibh/device-load-fix-develop-branch
dual-connectivity
edrx
extend_sharedlibusage
extend_sharedlibusage2
fapi_for_dmrs_and_ptrs
feat-mac-sock
feature-4g-sched
feature-nr-4g-nfapi-modifications
feature-support-clang-format
feature/make-s1-mme-port-configurable
feature/make-s1-mme-port-configurable-with-astyle-fixes
fembms-enb-ue
finalize-oaicn-integration
firas
fix-ci-tun
fix-clock-source
fix-itti-segv
fix-l2-sim
fix-limeSDR-compile
fix-softmodem-restart
fix-warnings
fix_do_ra_data
fix_pdsch_low_prb
fix_rfsim_mimo
fix_rrc_x2_ticking
fixes-mac-sched-nfapi
fixes-mac-sched-tun
fixes-tun
flexran-apps
flexran-improvements
flexran-repair-mme-mgmt
fr2-hw-test
fujitsu_lte_contribution
fujitsu_lte_contribution-128
gNB-nrUE-USRP
generate_push_ptrs
harq-hotfix
hotfix-minor-remove-nr-rlc-cppcheck-error
hotfix-nr-rlc-tick
hotfix-ocp-executable
hotfix-ue-musim-compilation
hotfix_usrp_lib
improve_build_nr_lte_merge
improve_nr_modulation
improve_ue_stability
integration-develop-nr-2019w45
integration_2020_wk40
integration_2020_wk41
integration_2020_wk42_2
integration_2020_wk45
integration_2020_wk45_2
integration_2020_wk46
integration_2020_wk46_2
integration_2020_wk47
integration_2020_wk48
integration_2020_wk48_2
integration_2020_wk49
integration_2020_wk50
integration_2020_wk50_1
inter-RRU-final
inter-RRU-nr
inter-RRU-oairu
inter-rru-UE
interoperability-test
isip_nr
l2-fixes
ldpc-dec-layering
ldpc-decoder-codegen
ldpc-decoder-codegen2
ldpc-decoder-improvements
ldpc-offload
ldpc_short_codeword_fixes
load_gnb
lte_uplink_improvement
mac-fixes-wk45_2
mbms-fix-develop-nr
merging-2019-w51-to-develop-nr
mosaic5g-oai-ran
mosaic5g-oai-sim
new_rlc_2020
nfapi-bugfix
nfapi_nr_develop
ngap-dlul
ngap-support
ngap-w48-merge2
ngap-wf
ngap-wf-1120
ngap-wf-1120-srb
ngap-wf-1120-srb-gtp
ngap-wf-1120-srb-gtp-hs
ngap-wf-1120-srb-gtp-hs1
ngap-wf-1120-srb-gtp-hs2
ngap-wf-1203-yunsdr
ngap-wf-liuyu
ngap_lfq_1120
ngap_merge
noCore
nr-coreset-bug-fix
nr-dlsch-multi-thread
nr-dlsch-thread
nr-dual-connectivity
nr-interdigital-test
nr-ip-uplink-noS1
nr-mac-pdu-wireshark
nr-mac-remove-ue-list
nr-multiple-ssb
nr-pdcp
nr-pdsch-extraction-bugfix
nr-physim-update
nr-rlc-am-bugfix-w44
nr-rlc-bugfix-w44
nr-ssb-measurements
nr-timing-measurement
nr-timing-measurement-merge
nr-ue-buffer-status
nr-ue-slot-based
nr-uldci
nrUE
nrUE-hs
nrUE-upper-layer
nr_beamforming
nr_bsr
nr_ci_dlsim
nr_csi_newbranch
nr_dci_procedures
nr_demo_wsa2019
nr_dl_dmrs_type2
nr_dl_pf
nr_dl_pf2
nr_dl_ul_ptrs
nr_dlsch_parallel_measurements
nr_dlsim_plot
nr_fapi_for_push_tmp
nr_fdd_if_fix
nr_fix_easycppcheck
nr_flexible_NRBDL
nr_improve_build_procedures
nr_increase_tp
nr_mib_vsa_test
nr_pdcch_testing
nr_pdcch_updates
nr_pdsch_integration
nr_polar_decoder_improvement
nr_prach
nr_prach_fr2
nr_pucch
nr_pucch2
nr_segmentation_fixes
nr_sim_fix
nr_tdd_configuration
nr_ue_msg3
nr_ue_tti_cleanup
nr_vcd
nrue-multi-thread
nrue_msg2_reception
nsa-ue
nsa_remove_band_hardcodings
oai-sim
oai-ubuntu-docker
oai-ubuntu-docker-for-lmssdr
oairu
oc-docker-october-improvements
openxg/develop
pdcp-benchmark
pdsch-ch-est
polar8
ptrs_rrc_config
pusch-mthread-scaling-fix
ra-dl-ul
reduce_memory_footprint
remove-ci-workaround
remove_nos1_hack_pdcp
remove_x2_gnb_hardcoding
repair-TA
revert-f5c94279
revert_memcpy
rh_ci_fix_autoterminate
rh_ci_fr1_update
rh_ci_oc
rh_ci_py
rh_ci_rfsim_ra
rh_doc_update_3
rh_fr1_newjenkins
rh_fr1_update
rh_gnb_compile_fix
rh_wk50_debug
rlc-v2-bugfix-status-reporting
rlc-v2-tick
rlc_v2_coverity_fixes
rrc-enb-phy-testmode
ru-parallel-beamforming
runel
runel-reverse-test
s1_subnormal
s1ap-bugfix-rab_setup
small-bugfixes-w40
smallcleanup
softmodem_cleanup
split73
test-x310-perf
testing_2symb_pdcch
testing_with_external_txdata
thread-pool
tools_5Gadapt
tp-ota-test
trx_thread_param
trx_write_thread
ue-csi
ue-fixes-ota
ue-updates-runel-test
ue_adjust_gain
ue_dlsch-multi-threading
ue_dlsch_decoding_ldpc_offload
ue_nfapi_mch
uhd_priority_set_cleanup
ul_dl_dci_same_slot
ul_harq
ulsch_decode_mthread
ulsim_changes
update-to-2019-march-june-release
usrp_fix_adc_shift_and_pps_sync
usrp_gpio_test
x2-endc-processing
yihongzheng_srb
zzs
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
114 additions
and
30 deletions
+114
-30
openair1/PHY/CODING/nrPolar_tools/nr_polar_decoding_tools.c
openair1/PHY/CODING/nrPolar_tools/nr_polar_decoding_tools.c
+114
-30
No files found.
openair1/PHY/CODING/nrPolar_tools/nr_polar_decoding_tools.c
View file @
76ebc5ad
...
...
@@ -254,6 +254,18 @@ void build_decoder_tree(t_nrPolar_params *pp) {
}
#if defined(__arm__) || defined(__aarch64__)
// translate 1-1 SIMD functions from SSE to NEON
#define __m128i int16x8_t
#define __m64 int8x8_t
#define _mm_abs_epi16(a) vabsq_s16(a)
#define _mm_min_epi16(a,b) vminq_s16(a,b)
#define _mm_subs_epi16(a,b) vsubq_s16(a,b)
#define _mm_abs_pi16(a) vabs_s16(a)
#define _mm_min_pi16(a,b) vmin_s16(a,b)
#define _mm_subs_pi16(a,b) vsub_s16(a,b)
#endif
void
applyFtoleft
(
t_nrPolar_params
*
pp
,
decoder_node_t
*
node
)
{
int16_t
*
alpha_v
=
node
->
alpha
;
int16_t
*
alpha_l
=
node
->
left
->
alpha
;
...
...
@@ -270,7 +282,6 @@ void applyFtoleft(t_nrPolar_params *pp,decoder_node_t *node) {
if
(
node
->
left
->
all_frozen
==
0
)
{
#if defined(__AVX2__)
int
avx2mod
=
(
node
->
Nv
/
2
)
&
15
;
if
(
avx2mod
==
0
)
{
...
...
@@ -284,14 +295,7 @@ void applyFtoleft(t_nrPolar_params *pp,decoder_node_t *node) {
absa256
=
_mm256_abs_epi16
(
a256
);
absb256
=
_mm256_abs_epi16
(
b256
);
minabs256
=
_mm256_min_epi16
(
absa256
,
absb256
);
((
__m256i
*
)
alpha_l
)[
i
]
=
_mm256_sign_epi16
(
minabs256
,
_mm256_xor_si256
(
a256
,
b256
));
/* for (int j=0;j<16;j++) printf("alphal[%d] %d (%d,%d,%d)\n",
(16*i) + j,
alpha_l[(16*i)+j],
((int16_t*)&minabs256)[j],
alpha_v[(16*i)+j],
alpha_v[(16*i)+j+(node->Nv/2)]);
*/
((
__m256i
*
)
alpha_l
)[
i
]
=
_mm256_sign_epi16
(
minabs256
,
_mm256_sign_epi16
(
a256
,
b256
));
}
}
else
if
(
avx2mod
==
8
)
{
...
...
@@ -301,7 +305,7 @@ void applyFtoleft(t_nrPolar_params *pp,decoder_node_t *node) {
absa128
=
_mm_abs_epi16
(
a128
);
absb128
=
_mm_abs_epi16
(
b128
);
minabs128
=
_mm_min_epi16
(
absa128
,
absb128
);
*
((
__m128i
*
)
alpha_l
)
=
_mm_sign_epi16
(
minabs128
,
_mm_
xor_si128
(
a128
,
b128
));
*
((
__m128i
*
)
alpha_l
)
=
_mm_sign_epi16
(
minabs128
,
_mm_
sign_epi16
(
a128
,
b128
));
}
else
if
(
avx2mod
==
4
)
{
__m64
a64
,
b64
,
absa64
,
absb64
,
minabs64
;
...
...
@@ -310,11 +314,56 @@ void applyFtoleft(t_nrPolar_params *pp,decoder_node_t *node) {
absa64
=
_mm_abs_pi16
(
a64
);
absb64
=
_mm_abs_pi16
(
b64
);
minabs64
=
_mm_min_pi16
(
absa64
,
absb64
);
*
((
__m64
*
)
alpha_l
)
=
_mm_sign_pi16
(
minabs64
,
_mm_xor_si64
(
a64
,
b64
));
*
((
__m64
*
)
alpha_l
)
=
_mm_sign_pi16
(
minabs64
,
_mm_sign_pi16
(
a64
,
b64
));
}
else
#else
int
sse4mod
=
(
node
->
Nv
/
2
)
&
7
;
int
sse4len
=
node
->
Nv
/
2
/
8
;
#if defined(__arm__) || defined(__aarch64__)
int16x8_t
signatimesb
,
comp1
,
comp2
,
negminabs128
;
int16x8_t
zero
=
vdupq_n_s16
(
0
);
#endif
if
(
sse4mod
==
0
)
{
for
(
int
i
=
0
;
i
<
sse4len
;
i
++
)
{
__m128i
a128
,
b128
,
absa128
,
absb128
,
minabs128
;
int
sse4len
=
node
->
Nv
/
2
/
8
;
a128
=*
((
__m128i
*
)
alpha_v
);
b128
=
((
__m128i
*
)
alpha_v
)[
1
];
absa128
=
_mm_abs_epi16
(
a128
);
absb128
=
_mm_abs_epi16
(
b128
);
minabs128
=
_mm_min_epi16
(
absa128
,
absb128
);
#if defined(__arm__) || defined(__aarch64__)
// unfortunately no direct equivalent to _mm_sign_epi16
signatimesb
=
vxorrq_s16
(
a128
,
b128
);
comp1
=
vcltq_s16
(
signatimesb
,
zero
);
comp2
=
vcgeq_s16
(
signatimesb
,
zero
);
negminabs128
=
vnegq_s16
(
minabs128
);
*
((
__m128i
*
)
alpha_l
)
=
vorrq_s16
(
vandq_s16
(
minabs128
,
comp0
),
vandq_s16
(
negminabs128
,
comp1
));
#else
*
((
__m128i
*
)
alpha_l
)
=
_mm_sign_epi16
(
minabs128
,
_mm_sign_epi16
(
a128
,
b128
));
#endif
}
}
else
if
(
sse4mod
==
4
)
{
__m64
a64
,
b64
,
absa64
,
absb64
,
minabs64
;
a64
=*
((
__m64
*
)
alpha_v
);
b64
=
((
__m64
*
)
alpha_v
)[
1
];
absa64
=
_mm_abs_pi16
(
a64
);
absb64
=
_mm_abs_pi16
(
b64
);
minabs64
=
_mm_min_pi16
(
absa64
,
absb64
);
#if defined(__arm__) || defined(__aarch64__)
AssertFatal
(
1
==
0
,
"Need to do this still for ARM
\n
"
);
#else
*
((
__m64
*
)
alpha_l
)
=
_mm_sign_pi16
(
minabs64
,
_mm_sign_epi16
(
a64
,
b64
));
#endif
}
else
#endif
{
{
// equvalent scalar code to above, activated only on non x86/ARM architectures
for
(
int
i
=
0
;
i
<
node
->
Nv
/
2
;
i
++
)
{
a
=
alpha_v
[
i
];
b
=
alpha_v
[
i
+
(
node
->
Nv
/
2
)];
...
...
@@ -367,9 +416,34 @@ void applyGtoright(t_nrPolar_params *pp,decoder_node_t *node) {
else
if
(
avx2mod
==
8
)
{
((
__m128i
*
)
alpha_r
)[
0
]
=
_mm_subs_epi16
(((
__m128i
*
)
alpha_v
)[
1
],
_mm_sign_epi16
(((
__m128i
*
)
alpha_v
)[
0
],((
__m128i
*
)
betal
)[
0
]));
}
else
if
(
avx2mod
==
4
)
{
((
__m64
*
)
alpha_r
)[
0
]
=
_mm_subs_pi16
(((
__m64
*
)
alpha_v
)[
1
],
_mm_sign_pi16
(((
__m64
*
)
alpha_v
)[
0
],((
__m64
*
)
betal
)[
0
]));
}
else
#else
int
sse4mod
=
(
node
->
Nv
/
2
)
&
7
;
if
(
sse4mod
==
0
)
{
int
sse4len
=
node
->
Nv
/
2
/
8
;
for
(
int
i
=
0
;
i
<
sse4len
;
i
++
)
{
#if defined(__arm__) || defined(__aarch64__)
((
int16x8_t
*
)
alpha_r
)[
0
]
=
vsubq_s16
(((
int16x8_t
*
)
alpha_v
)[
1
],
vmulq_epi16
(((
int16x8_t
*
)
alpha_v
)[
0
],((
int16x8_t
*
)
betal
)[
0
]));
#else
((
__m128i
*
)
alpha_r
)[
0
]
=
_mm_subs_epi16
(((
__m128i
*
)
alpha_v
)[
1
],
_mm_sign_epi16
(((
__m128i
*
)
alpha_v
)[
0
],((
__m128i
*
)
betal
)[
0
]));
#endif
}
}
else
if
(
sse4mod
==
4
)
{
#if defined(__arm__) || defined(__aarch64__)
((
int16x4_t
*
)
alpha_r
)[
0
]
=
vsub_s16
(((
int16x4_t
*
)
alpha_v
)[
1
],
vmul_epi16
(((
int16x4_t
*
)
alpha_v
)[
0
],((
int16x4_t
*
)
betal
)[
0
]));
#else
((
__m64
*
)
alpha_r
)[
0
]
=
_mm_subs_pi16
(((
__m64
*
)
alpha_v
)[
1
],
_mm_sign_pi16
(((
__64
*
)
alpha_v
)[
0
],((
__m64
*
)
betal
)[
0
]));
#endif
}
else
#endif
{
{
// equvalent scalar code to above, activated only on non x86/ARM architectures
for
(
int
i
=
0
;
i
<
node
->
Nv
/
2
;
i
++
)
{
alpha_r
[
i
]
=
alpha_v
[
i
+
(
node
->
Nv
/
2
)]
-
(
betal
[
i
]
*
alpha_v
[
i
]);
}
...
...
@@ -385,10 +459,10 @@ void applyGtoright(t_nrPolar_params *pp,decoder_node_t *node) {
}
int16_t
minus1
[
16
]
=
{
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
};
int16_t
all1
[
16
]
=
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
};
void
computeBeta
(
t_nrPolar_params
*
pp
,
decoder_node_t
*
node
)
{
...
...
@@ -401,27 +475,37 @@ void computeBeta(t_nrPolar_params *pp,decoder_node_t *node) {
if
(
node
->
left
->
all_frozen
==
0
)
{
// if left node is not aggregation of frozen bits
#if defined(__AVX2__)
int
avx2mod
=
(
node
->
Nv
/
2
)
&
15
;
register
__m256i
allones
=*
((
__m256i
*
)
all1
);
if
(
avx2mod
==
0
)
{
int
avx2len
=
node
->
Nv
/
2
/
16
;
for
(
int
i
=
0
;
i
<
avx2len
;
i
++
)
{
((
__m256i
*
)
betav
)[
i
]
=
_mm256_sign_epi16
(((
__m256i
*
)
betar
)[
i
],
((
__m256i
*
)
betal
)[
i
]);
((
__m256i
*
)
betav
)[
i
]
=
_mm256_sign_epi16
(((
__m256i
*
)
betav
)[
i
],
((
__m256i
*
)
minus1
)[
0
]);
((
__m256i
*
)
betav
)[
i
]
=
_mm256_or_si256
(
_mm256_cmpeq_epi16
(((
__m256i
*
)
betar
)[
i
],
((
__m256i
*
)
betal
)[
i
]),
allones
);
}
}
else
if
(
avx2mod
==
8
)
{
((
__m128i
*
)
betav
)[
0
]
=
_mm_sign_epi16
(((
__m128i
*
)
betar
)[
0
],
((
__m128i
*
)
betal
)[
0
]);
((
__m128i
*
)
betav
)[
0
]
=
_mm_sign_epi16
(((
__m128i
*
)
betav
)[
0
],
((
__m128i
*
)
minus1
)[
0
]);
((
__m128i
*
)
betav
)[
0
]
=
_mm_or_si128
(
_mm_cmpeq_epi16
(((
__m128i
*
)
betar
)[
0
],
((
__m128i
*
)
betal
)[
0
]),
*
((
__m128i
*
)
all1
));
}
else
if
(
avx2mod
==
4
)
{
((
__m64
*
)
betav
)[
0
]
=
_mm_sign_pi16
(((
__m64
*
)
betar
)[
0
],
((
__m64
*
)
betal
)[
0
]);
((
__m64
*
)
betav
)[
0
]
=
_mm_sign_pi16
(((
__m64
*
)
betav
)[
0
],
((
__m64
*
)
minus1
)[
0
]);
((
__m64
*
)
betav
)[
0
]
=
_mm_or_si64
(
_mm_cmpeq_pi16
(((
__m64
*
)
betar
)[
0
],
((
__m64
*
)
betal
)[
0
]),
*
((
__m64
*
)
all1
));
}
else
#else
int
avx2mod
=
(
node
->
Nv
/
2
)
&
15
;
if
(
ssr4mod
==
0
)
{
int
ssr4len
=
node
->
Nv
/
2
/
8
;
register
__m128i
allones
=*
((
__m128i
*
)
all1
);
for
(
int
i
=
0
;
i
<
sse4len
;
i
++
)
{
((
__m256i
*
)
betav
)[
i
]
=
_mm_or_si128
(
_mm_cmpeq_epi16
(((
__m128i
*
)
betar
)[
i
],
((
__m128i
*
)
betal
)[
i
]),
allones
));
}
}
else
if
(
sse4mod
==
4
)
{
((
__m64
*
)
betav
)[
0
]
=
_mm_or_si64
(
_mm_cmpeq_pi16
(((
__m64
*
)
betar
)[
0
],
((
__m64
*
)
betal
)[
0
]),
*
((
__m64
*
)
all1
));
}
else
#endif
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment