Commit 76ebc5ad authored by Raymond Knopp

addition of ARM NEON intrinsics

parent 88d5bf42
develop 1 128-ues 256_QAM_demod 375-syrtem-sdr-platform 408-reworked 408-ue-main-threads 445-LDPC-implementation-on-GPU 459-pusch-based-ta-updates 464-ru_beamforming_in_gpu 464-ru_beamforming_in_gpu-CPUsubfunction 472-add-pusch-dmrs-modes 481-ldpc-decoder-on-gpu 5g_fapi_scf LTE_TRX_on_single_port NCTU_CS_ISIP NCTU_CS_ISIP_CPU NCTU_CS_ISIP_GPU NCTU_OpinConnect_LDPC NR-PHY-MAC-IF-multi-UE NR_10MHz NR_CSI_reporting NR_DCI_01 NR_DL_sched_fixes NR_DL_scheduler NR_FAPI_beamindex_SSB_RO NR_FR2_RA NR_FR2_RRC_SSB NR_MAC_CE_GlobalEdge NR_MAC_Multi_Rach_GlobalEdge NR_MAC_SSB_RO_GlobalEdge NR_MAC_SSB_RO_UE_IDCC NR_MAC_SSB_RO_merge NR_MAC_TCI_UCI_GlobalEdge NR_NGAP NR_PDCP_noS1 NR_PUCCH_MultiUE NR_RA_updates NR_RRCConfiguragion_FR2 NR_RRCConfiguration NR_RRCConfiguration_FR2 NR_RRCConfiguration_S1U NR_RRCConfiguration_merge_develop NR_RRCConfiguration_sync_source NR_RRCConfiguration_trx_thread NR_RRC_CP_bugfix NR_RRC_PDCP NR_RRC_PRACH_procedures NR_RRC_PRACH_procedures_todevelop NR_RRC_PUSCH NR_RRC_TA NR_RRC_X2AP_AMBR_Change_Global_edge NR_RRC_X2AP_RemoveHardcodings_GlobalEdge NR_RRC_config_simplified NR_RRC_harq NR_RRC_harq_b NR_RRC_harq_hacks NR_RRC_harq_newdcipdu NR_SA_NGAP_RRC NR_SA_NGAP_RRC_wk42 NR_SA_itti_sim_wk48 NR_SCHED NR_SRB_Config NR_TRX_on_single_port NR_TRX_on_single_port2 NR_UE_MAC_scheduler NR_UE_RA_fixes NR_UE_UL_DCI_improvements NR_UE_enable_parallelization NR_UE_stability_fixes NR_UL_FAPI_programming NR_UL_scheduler NR_UL_scheduler_rebased NR_UL_scheduling NR_beamforming_test NR_gNB_SCF_Indication NR_ipaccess_testing NR_mac_uci_functions_rework NR_msg2_phytest NR_scheduling_CSIRS NR_scheduling_request NR_test_S1U_RRC_PRACH_procedures NR_ue_dlsch_dmrs_cdm OpInConnect_ISIP PUSCH_TA_update RA_CI_test UE_DL_DCI_hotfix addoptions_nr_USRPdevice bch-fixes-bitmap benetel_5g_prach_fix benetel_phase_rotation benetel_phase_rotation_old bugfix-minor-remove-wrong-log bugfix-nr-bands bugfix-nr-ldpc-post-processing bugfix-nr-ldpc-size-typo bugfix-nr-pdcp-sn-size 
bugfix-nr-rate-matching-assertion cce_indexing_fix cce_indexing_fix2 ci-deploy-docker-compose ci-rd-july-improvements ci-ul-iperf-from-trf-container clean-5G-scope-round2 cleanup_softmodem_main constant_power debug_branch_init_sync develop-ci develop-nr develop-nr-adding-2018-09-asn1 develop-nr-fr2 develop-nr-fr2-rework develop-nr_cppcheck develop-oriecpriupdates develop-sib1 develop_inria_ci_deployment develop_inria_ci_deployment_gp develop_integration_2020_w15 develop_integration_2020_w19 develop_integration_w08 dfts_alternatives dlsch-all-dlslots dlsch_encode_mthread dlsch_parallel docupdate_tools dongzhanyi-zte-develop dongzhanyi-zte-develop2 dreibh/apt-auth-fix dreibh/device-load-fix dreibh/device-load-fix-develop-branch dual-connectivity edrx extend_sharedlibusage extend_sharedlibusage2 fapi_for_dmrs_and_ptrs feat-mac-sock feature-4g-sched feature-nr-4g-nfapi-modifications feature-support-clang-format feature/make-s1-mme-port-configurable feature/make-s1-mme-port-configurable-with-astyle-fixes fembms-enb-ue finalize-oaicn-integration firas fix-ci-tun fix-clock-source fix-itti-segv fix-l2-sim fix-limeSDR-compile fix-softmodem-restart fix-warnings fix_do_ra_data fix_pdsch_low_prb fix_rfsim_mimo fix_rrc_x2_ticking fixes-mac-sched-nfapi fixes-mac-sched-tun fixes-tun flexran-apps flexran-improvements flexran-repair-mme-mgmt fr2-hw-test fujitsu_lte_contribution fujitsu_lte_contribution-128 gNB-nrUE-USRP generate_push_ptrs harq-hotfix hotfix-minor-remove-nr-rlc-cppcheck-error hotfix-nr-rlc-tick hotfix-ocp-executable hotfix-ue-musim-compilation hotfix_usrp_lib improve_build_nr_lte_merge improve_nr_modulation improve_ue_stability integration-develop-nr-2019w45 integration_2020_wk40 integration_2020_wk41 integration_2020_wk42_2 integration_2020_wk45 integration_2020_wk45_2 integration_2020_wk46 integration_2020_wk46_2 integration_2020_wk47 integration_2020_wk48 integration_2020_wk48_2 integration_2020_wk49 integration_2020_wk50 integration_2020_wk50_1 inter-RRU-final 
inter-RRU-nr inter-RRU-oairu inter-rru-UE interoperability-test isip_nr l2-fixes ldpc-dec-layering ldpc-decoder-codegen ldpc-decoder-codegen2 ldpc-decoder-improvements ldpc-offload ldpc_short_codeword_fixes load_gnb lte_uplink_improvement mac-fixes-wk45_2 mbms-fix-develop-nr merging-2019-w51-to-develop-nr mosaic5g-oai-ran mosaic5g-oai-sim new_rlc_2020 nfapi-bugfix nfapi_nr_develop ngap-dlul ngap-support ngap-w48-merge2 ngap-wf ngap-wf-1120 ngap-wf-1120-srb ngap-wf-1120-srb-gtp ngap-wf-1120-srb-gtp-hs ngap-wf-1120-srb-gtp-hs1 ngap-wf-1120-srb-gtp-hs2 ngap-wf-1203-yunsdr ngap-wf-liuyu ngap_lfq_1120 ngap_merge noCore nr-coreset-bug-fix nr-dlsch-multi-thread nr-dlsch-thread nr-dual-connectivity nr-interdigital-test nr-ip-uplink-noS1 nr-mac-pdu-wireshark nr-mac-remove-ue-list nr-multiple-ssb nr-pdcp nr-pdsch-extraction-bugfix nr-physim-update nr-rlc-am-bugfix-w44 nr-rlc-bugfix-w44 nr-ssb-measurements nr-timing-measurement nr-timing-measurement-merge nr-ue-buffer-status nr-ue-slot-based nr-uldci nrUE nrUE-hs nrUE-upper-layer nr_beamforming nr_bsr nr_ci_dlsim nr_csi_newbranch nr_dci_procedures nr_demo_wsa2019 nr_dl_dmrs_type2 nr_dl_pf nr_dl_pf2 nr_dl_ul_ptrs nr_dlsch_parallel_measurements nr_dlsim_plot nr_fapi_for_push_tmp nr_fdd_if_fix nr_fix_easycppcheck nr_flexible_NRBDL nr_improve_build_procedures nr_increase_tp nr_mib_vsa_test nr_pdcch_testing nr_pdcch_updates nr_pdsch_integration nr_polar_decoder_improvement nr_prach nr_prach_fr2 nr_pucch nr_pucch2 nr_segmentation_fixes nr_sim_fix nr_tdd_configuration nr_ue_msg3 nr_ue_tti_cleanup nr_vcd nrue-multi-thread nrue_msg2_reception nsa-ue nsa_remove_band_hardcodings oai-sim oai-ubuntu-docker oai-ubuntu-docker-for-lmssdr oairu oc-docker-october-improvements openxg/develop pdcp-benchmark pdsch-ch-est polar8 ptrs_rrc_config pusch-mthread-scaling-fix ra-dl-ul reduce_memory_footprint remove-ci-workaround remove_nos1_hack_pdcp remove_x2_gnb_hardcoding repair-TA revert-f5c94279 revert_memcpy rh_ci_fix_autoterminate 
rh_ci_fr1_update rh_ci_oc rh_ci_py rh_ci_rfsim_ra rh_doc_update_3 rh_fr1_newjenkins rh_fr1_update rh_gnb_compile_fix rh_wk50_debug rlc-v2-bugfix-status-reporting rlc-v2-tick rlc_v2_coverity_fixes rrc-enb-phy-testmode ru-parallel-beamforming runel runel-reverse-test s1_subnormal s1ap-bugfix-rab_setup small-bugfixes-w40 smallcleanup softmodem_cleanup split73 test-x310-perf testing_2symb_pdcch testing_with_external_txdata thread-pool tools_5Gadapt tp-ota-test trx_thread_param trx_write_thread ue-csi ue-fixes-ota ue-updates-runel-test ue_adjust_gain ue_dlsch-multi-threading ue_dlsch_decoding_ldpc_offload ue_nfapi_mch uhd_priority_set_cleanup ul_dl_dci_same_slot ul_harq ulsch_decode_mthread ulsim_changes update-to-2019-march-june-release usrp_fix_adc_shift_and_pps_sync usrp_gpio_test x2-endc-processing yihongzheng_srb zzs
No related merge requests found
......@@ -254,6 +254,18 @@ void build_decoder_tree(t_nrPolar_params *pp) {
}
#if defined(__arm__) || defined(__aarch64__)
/*
 * Map the SSE/MMX names used below onto one-to-one NEON equivalents so the
 * same vector code compiles on ARM.
 */
#define __m128i int16x8_t
/* Four 16-bit lanes: the 64-bit intrinsics mapped below are all *_s16 forms. */
#define __m64 int16x4_t
#define _mm_abs_epi16(a) vabsq_s16(a)
#define _mm_min_epi16(a,b) vminq_s16(a,b)
/* _mm_subs_* saturate on x86; vsub* wraps, so the saturating vqsub* is the match. */
#define _mm_subs_epi16(a,b) vqsubq_s16(a,b)
#define _mm_abs_pi16(a) vabs_s16(a)
#define _mm_min_pi16(a,b) vmin_s16(a,b)
#define _mm_subs_pi16(a,b) vqsub_s16(a,b)
#endif
void applyFtoleft(t_nrPolar_params *pp,decoder_node_t *node) {
int16_t *alpha_v=node->alpha;
int16_t *alpha_l=node->left->alpha;
......@@ -270,7 +282,6 @@ void applyFtoleft(t_nrPolar_params *pp,decoder_node_t *node) {
if (node->left->all_frozen == 0) {
#if defined(__AVX2__)
int avx2mod = (node->Nv/2)&15;
if (avx2mod == 0) {
......@@ -284,14 +295,7 @@ void applyFtoleft(t_nrPolar_params *pp,decoder_node_t *node) {
absa256 =_mm256_abs_epi16(a256);
absb256 =_mm256_abs_epi16(b256);
minabs256 =_mm256_min_epi16(absa256,absb256);
((__m256i*)alpha_l)[i] =_mm256_sign_epi16(minabs256,_mm256_xor_si256(a256,b256));
/* for (int j=0;j<16;j++) printf("alphal[%d] %d (%d,%d,%d)\n",
(16*i) + j,
alpha_l[(16*i)+j],
((int16_t*)&minabs256)[j],
alpha_v[(16*i)+j],
alpha_v[(16*i)+j+(node->Nv/2)]);
*/
((__m256i*)alpha_l)[i] =_mm256_sign_epi16(minabs256,_mm256_sign_epi16(a256,b256));
}
}
else if (avx2mod == 8) {
......@@ -301,7 +305,7 @@ void applyFtoleft(t_nrPolar_params *pp,decoder_node_t *node) {
absa128 =_mm_abs_epi16(a128);
absb128 =_mm_abs_epi16(b128);
minabs128 =_mm_min_epi16(absa128,absb128);
*((__m128i*)alpha_l) =_mm_sign_epi16(minabs128,_mm_xor_si128(a128,b128));
*((__m128i*)alpha_l) =_mm_sign_epi16(minabs128,_mm_sign_epi16(a128,b128));
}
else if (avx2mod == 4) {
__m64 a64,b64,absa64,absb64,minabs64;
......@@ -310,11 +314,56 @@ void applyFtoleft(t_nrPolar_params *pp,decoder_node_t *node) {
absa64 =_mm_abs_pi16(a64);
absb64 =_mm_abs_pi16(b64);
minabs64 =_mm_min_pi16(absa64,absb64);
*((__m64*)alpha_l) =_mm_sign_pi16(minabs64,_mm_xor_si64(a64,b64));
*((__m64*)alpha_l) =_mm_sign_pi16(minabs64,_mm_sign_pi16(a64,b64));
}
else
#else
int sse4mod = (node->Nv/2)&7;
int sse4len = node->Nv/2/8;
#if defined(__arm__) || defined(__aarch64__)
int16x8_t signatimesb,comp1,comp2,negminabs128;
int16x8_t zero=vdupq_n_s16(0);
#endif
if (sse4mod == 0) {
for (int i=0;i<sse4len;i++) {
__m128i a128,b128,absa128,absb128,minabs128;
int sse4len = node->Nv/2/8;
a128 =*((__m128i*)alpha_v);
b128 =((__m128i*)alpha_v)[1];
absa128 =_mm_abs_epi16(a128);
absb128 =_mm_abs_epi16(b128);
minabs128 =_mm_min_epi16(absa128,absb128);
#if defined(__arm__) || defined(__aarch64__)
// unfortunately no direct equivalent to _mm_sign_epi16
signatimesb=vxorrq_s16(a128,b128);
comp1=vcltq_s16(signatimesb,zero);
comp2=vcgeq_s16(signatimesb,zero);
negminabs128=vnegq_s16(minabs128);
*((__m128i*)alpha_l) =vorrq_s16(vandq_s16(minabs128,comp0),vandq_s16(negminabs128,comp1));
#else
*((__m128i*)alpha_l) =_mm_sign_epi16(minabs128,_mm_sign_epi16(a128,b128));
#endif
}
}
else if (sse4mod == 4) {
__m64 a64,b64,absa64,absb64,minabs64;
a64 =*((__m64*)alpha_v);
b64 =((__m64*)alpha_v)[1];
absa64 =_mm_abs_pi16(a64);
absb64 =_mm_abs_pi16(b64);
minabs64 =_mm_min_pi16(absa64,absb64);
#if defined(__arm__) || defined(__aarch64__)
AssertFatal(1==0,"Need to do this still for ARM\n");
#else
*((__m64*)alpha_l) =_mm_sign_pi16(minabs64,_mm_sign_epi16(a64,b64));
#endif
}
else
#endif
{
{ // equivalent scalar code to the above, activated only on non-x86/ARM architectures
for (int i=0;i<node->Nv/2;i++) {
a=alpha_v[i];
b=alpha_v[i+(node->Nv/2)];
......@@ -367,9 +416,34 @@ void applyGtoright(t_nrPolar_params *pp,decoder_node_t *node) {
else if (avx2mod == 8) {
((__m128i *)alpha_r)[0] = _mm_subs_epi16(((__m128i *)alpha_v)[1],_mm_sign_epi16(((__m128i *)alpha_v)[0],((__m128i *)betal)[0]));
}
else if (avx2mod == 4) {
((__m64 *)alpha_r)[0] = _mm_subs_pi16(((__m64 *)alpha_v)[1],_mm_sign_pi16(((__m64 *)alpha_v)[0],((__m64 *)betal)[0]));
}
else
#else
int sse4mod = (node->Nv/2)&7;
if (sse4mod == 0) {
int sse4len = node->Nv/2/8;
for (int i=0;i<sse4len;i++) {
#if defined(__arm__) || defined(__aarch64__)
((int16x8_t *)alpha_r)[0] = vsubq_s16(((int16x8_t *)alpha_v)[1],vmulq_epi16(((int16x8_t *)alpha_v)[0],((int16x8_t *)betal)[0]));
#else
((__m128i *)alpha_r)[0] = _mm_subs_epi16(((__m128i *)alpha_v)[1],_mm_sign_epi16(((__m128i *)alpha_v)[0],((__m128i *)betal)[0]));
#endif
}
}
else if (sse4mod == 4) {
#if defined(__arm__) || defined(__aarch64__)
((int16x4_t *)alpha_r)[0] = vsub_s16(((int16x4_t *)alpha_v)[1],vmul_epi16(((int16x4_t *)alpha_v)[0],((int16x4_t *)betal)[0]));
#else
((__m64 *)alpha_r)[0] = _mm_subs_pi16(((__m64 *)alpha_v)[1],_mm_sign_pi16(((__64 *)alpha_v)[0],((__m64 *)betal)[0]));
#endif
}
else
#endif
{
{// equivalent scalar code to the above, activated only on non-x86/ARM architectures
for (int i=0;i<node->Nv/2;i++) {
alpha_r[i] = alpha_v[i+(node->Nv/2)] - (betal[i]*alpha_v[i]);
}
......@@ -385,10 +459,10 @@ void applyGtoright(t_nrPolar_params *pp,decoder_node_t *node) {
}
/*
 * Sixteen 16-bit lanes, all -1. The array is read through vector-pointer
 * casts, so it is aligned to 32 bytes (the widest vector type used on it)
 * instead of relying on the default 2-byte alignment of int16_t.
 */
int16_t minus1[16] __attribute__((aligned(32))) = {
  -1, -1, -1, -1,
  -1, -1, -1, -1,
  -1, -1, -1, -1,
  -1, -1, -1, -1
};
/*
 * Sixteen 16-bit lanes, all +1. The array is loaded through vector-pointer
 * casts, so it is aligned to 32 bytes (the widest vector type used on it)
 * instead of relying on the default 2-byte alignment of int16_t.
 */
int16_t all1[16] __attribute__((aligned(32))) = {
  1, 1, 1, 1,
  1, 1, 1, 1,
  1, 1, 1, 1,
  1, 1, 1, 1
};
void computeBeta(t_nrPolar_params *pp,decoder_node_t *node) {
......@@ -401,27 +475,37 @@ void computeBeta(t_nrPolar_params *pp,decoder_node_t *node) {
if (node->left->all_frozen==0) { // if left node is not aggregation of frozen bits
#if defined(__AVX2__)
int avx2mod = (node->Nv/2)&15;
register __m256i allones=*((__m256i*)all1);
if (avx2mod == 0) {
int avx2len = node->Nv/2/16;
for (int i=0;i<avx2len;i++) {
((__m256i*)betav)[i] = _mm256_sign_epi16(((__m256i*)betar)[i],
((__m256i*)betal)[i]);
((__m256i*)betav)[i] = _mm256_sign_epi16(((__m256i*)betav)[i],
((__m256i*)minus1)[0]);
((__m256i*)betav)[i] = _mm256_or_si256(_mm256_cmpeq_epi16(((__m256i*)betar)[i],
((__m256i*)betal)[i]),allones);
}
}
else if (avx2mod == 8) {
((__m128i*)betav)[0] = _mm_sign_epi16(((__m128i*)betar)[0],
((__m128i*)betal)[0]);
((__m128i*)betav)[0] = _mm_sign_epi16(((__m128i*)betav)[0],
((__m128i*)minus1)[0]);
((__m128i*)betav)[0] = _mm_or_si128(_mm_cmpeq_epi16(((__m128i*)betar)[0],
((__m128i*)betal)[0]),*((__m128i*)all1));
}
else if (avx2mod == 4) {
((__m64*)betav)[0] = _mm_sign_pi16(((__m64*)betar)[0],
((__m64*)betal)[0]);
((__m64*)betav)[0] = _mm_sign_pi16(((__m64*)betav)[0],
((__m64*)minus1)[0]);
((__m64*)betav)[0] = _mm_or_si64(_mm_cmpeq_pi16(((__m64*)betar)[0],
((__m64*)betal)[0]),*((__m64*)all1));
}
else
#else
int avx2mod = (node->Nv/2)&15;
if (ssr4mod == 0) {
int ssr4len = node->Nv/2/8;
register __m128i allones=*((__m128i*)all1);
for (int i=0;i<sse4len;i++) {
((__m256i*)betav)[i] = _mm_or_si128(_mm_cmpeq_epi16(((__m128i*)betar)[i],
((__m128i*)betal)[i]),allones));
}
}
else if (sse4mod == 4) {
((__m64*)betav)[0] = _mm_or_si64(_mm_cmpeq_pi16(((__m64*)betar)[0],
((__m64*)betal)[0]),*((__m64*)all1));
}
else
#endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment