Commit 37c1219e authored by Raymond Knopp

AVX2 optimization of 16QAM NR ULSCH LLR

parent fc21364f
develop 1 128-ues 256_QAM_demod NR-PHY-MAC-IF-multi-UE NRUE_usedlschparallel NR_10MHz NR_CSI_reporting NR_DLUL_PF NR_DLUL_PF_4UL NR_DLUL_PF_rebased NR_DL_MIMO NR_DL_sched_fixes NR_DL_scheduler NR_FAPI_beamindex_SSB_RO NR_FDD_FIX NR_FR2_RA NR_FR2_RRC_SSB NR_MAC_Multi_Rach_GlobalEdge NR_MAC_SSB_RO_GlobalEdge NR_MAC_SSB_RO_UE_IDCC NR_MAC_SSB_RO_merge NR_MAC_TCI_UCI_GlobalEdge NR_NGAP NR_PUCCH_MultiUE NR_RA_cleanup NR_RA_updates NR_RRCConfiguration_FR2 NR_SA_F1AP_RFSIMULATOR NR_SA_F1AP_RFSIMULATOR2 NR_SA_F1AP_dev NR_SA_NGAP_RRC NR_SA_NGAP_RRC_wk42 NR_SA_itti_sim_wk48 NR_SA_itti_sim_wk48_hs NR_SA_itti_sim_wk48_hs1 NR_SCHED_HARQ NR_SCHED_PDCCH_PUCCH_HARQ NR_SCHED_PDCCH_PUCCH_HARQ_rebased NR_SCHED_fixes NR_SRB_Config NR_UE_MAC_scheduler NR_UE_RA_fixes NR_UE_UL_DCI_improvements NR_UE_dlsch_bugfix NR_UE_enable_parallelization NR_UE_stability_fixes NR_UL_scheduler NR_UL_scheduler_rebased NR_Wireshark NR_beam_simulation NR_mac_uci_functions_rework NR_scheduling_CSIRS NR_scheduling_request NR_scheduling_request2 NR_scheduling_request3 PUSCH_TA_update RA_CI_test UE_DL_DCI_hotfix bch-fixes-bitmap benetel_5g_prach_fix benetel_driver_uldl_pf_merge benetel_driver_update benetel_phase_rotation benetel_phase_rotation_old bugfix-nr-bands bugfix-nr-ldpc-post-processing bugfix-nr-ldpc-size-typo bugfix-nr-pdcp-sn-size bugfix-nr-rate-matching-assertion bugfix-x2-SgNBAdditionRequest bupt-sa-merge cce_indexing_fix cce_indexing_fix2 ci-deploy-asterix ci-deploy-docker-compose ci-test ci-ul-iperf-from-trf-container cleanup_softmodem_main constant_power develop-SA-RA develop-SnT develop-oriecpriupdates develop-sib1 develop-sib1-local develop-sib1-lts develop-sib1-update develop-sib1-update-test1 develop-sib1-update-ue develop_stable dfts_alternatives dlsch_encode_mthread dlsch_parallel dongzhanyi-zte-develop2 feature/make-s1-mme-port-configurable feature/make-s1-mme-port-configurable-with-astyle-fixes finalize-oaicn-integration fix-ci-tun fix-itti-segv fix_NR_DLUL_PF fix_do_ra_data 
fix_pdsch_low_prb fix_rrc_x2_ticking fixes-CE-RLC-PDU-size fixes-mac-sched-nfapi fixes-mac-sched-tun fixes-tun flexran-apps flexran-repair-mme-mgmt fujitsu_lte_contribution fujitsu_lte_contribution-128 gnb-only-test harq-hotfix hotfix-minor-remove-nr-rlc-cppcheck-error hotfix-nr-rlc-tick integration_2020_wk40 integration_2020_wk41 integration_2020_wk42_2 integration_2020_wk45 integration_2020_wk45_2 integration_2020_wk46 integration_2020_wk46_2 integration_2020_wk47 integration_2020_wk48 integration_2020_wk48_2 integration_2020_wk49 integration_2020_wk50 integration_2020_wk50_1 integration_2020_wk51 integration_2020_wk51_2 integration_2021_wk02 integration_2021_wk02_wMR988 integration_2021_wk04 integration_2021_wk05 inter-RRU-final itti-enhancement ldpc-decoder-codegen2 ldpc_short_codeword_fixes load_gnb lte-ulsch-bugfix lte_uplink_improvement mac-fixes-wk45_2 minor-fix-doc-basic-sim mosaic5g-oai-ran mosaic5g-oai-sim nasmesh_kernel_5.8 new-gtpu nfapi_nr_develop ngap-dlul ngap-w48-merge2 ngap-wf ngap-wf-1120 ngap-wf-1120-srb ngap-wf-1120-srb-gtp ngap-wf-1120-srb-gtp-hs ngap-wf-1120-srb-gtp-hs1 ngap-wf-1120-srb-gtp-hs2 ngap-wf-1120-srb-gtp-yhz ngap-wf-1203-yunsdr ngap-wf-liuyu ngap_lfq_1120 ngap_merge noCore nr-mac-pdu-wireshark nr-mac-remove-ue-list nr-rlc-am-bugfix-w44 nr-rlc-bugfix-w44 nrUE nrUE-hs nrUE-upper-layer nr_bsr nr_dl_dmrs_type2 nr_dl_pf nr_dl_pf2 nr_dl_ul_ptrs nr_fdd_if_fix nr_prach_fr2 nr_ue_msg3 nr_ue_tti_cleanup nr_ul_pf nr_ul_scfdma nsa-ue nsa_remove_band_hardcodings oai-sim oairu oairu-dockerfile-support oc-docker-october-improvements openxg/develop ptrs_rrc_config pusch-mthread-scaling-fix pusch-retrans-fix-ue ra-dl-ul remove_nos1_hack_pdcp remove_x2_gnb_hardcoding repair-TA revert_memcpy rh-ci-add-ue-parallelization rh_ci_fix_autoterminate rh_ci_fr1_update rh_ci_oc rh_ci_py rh_ci_ra_fr2 rh_ci_rfsim_ra rh_ci_ue_parallel rh_fr1_newjenkins rh_fr1_update rh_gnb_compile_fix rh_wk50_debug rlc-v2-bugfix-status-reporting rlc-v2-tick rrc-enb-phy-testmode 
s1-subnormal_rewrite s1_subnormal s1_subnormal-robert s1ap-bugfix-rab_setup sa-demo sa-merge-rrc-srb sa-msg4 sa-msg4-rrc sa-msg4-rrc-yihz sa-msg4-rrc-yihz-hs sa_rrc_yihz small-bugfixes-w40 small-config-change testing_with_external_txdata ue-fixes ue_beam_selection ul-freq-iq-samps-to-file ul_dl_dci_same_slot ulsch_decode_mthread ulsim_changes wireshark-T-hack-ueid wireshark-log-scheduling-requests x2-endc-processing xiangwab xiangwan yihongzheng_srb 2021.w04 2021.w02 2020.w51_2 2020.w51 2020.w50 2020.w49 2020.w48_2 2020.w48 2020.w47 2020.w46_2 2020.w46 2020.w45_2 2020.w45 2020.w44 2020.w42_2 2020.w42 2020.w41 2020.w39 2020.w38 2020.w37 2020.w36 benetel_phase_rotation
No related merge requests found
......@@ -257,16 +257,22 @@ void nr_ulsch_extract_rbs_single(int32_t **rxdataF,
uint8_t is_dmrs_re=0,is_ptrs_re=0;
start_re = (frame_parms->first_carrier_offset + (pusch_pdu->rb_start * NR_NB_SC_PER_RB))%frame_parms->ofdm_symbol_size;
nb_re_pusch = NR_NB_SC_PER_RB * pusch_pdu->rb_size;
#ifdef __AVX2__
int nb_re_pusch2 = nb_re_pusch + (nb_re_pusch&7);
#else
int nb_re_pusch2 = nb_re_pusch;
#endif
num_ptrs_symbols = 0;
for (aarx = 0; aarx < frame_parms->nb_antennas_rx; aarx++) {
rxF = (int16_t *)&rxdataF[aarx][symbol * frame_parms->ofdm_symbol_size];
rxF_ext = (int16_t *)&pusch_vars->rxdataF_ext[aarx][symbol * nb_re_pusch]; // [hna] rxdataF_ext isn't contiguous in order to solve an alignment problem in llr computation in case of mod_order = 4, 6
rxF_ext = (int16_t *)&pusch_vars->rxdataF_ext[aarx][symbol * nb_re_pusch2]; // [hna] rxdataF_ext isn't contiguous in order to solve an alignment problem in llr computation in case of mod_order = 4, 6
ul_ch0 = &pusch_vars->ul_ch_estimates[aarx][pusch_vars->dmrs_symbol*frame_parms->ofdm_symbol_size]; // update channel estimates if new dmrs symbol are available
ul_ch0_ext = &pusch_vars->ul_ch_estimates_ext[aarx][symbol*nb_re_pusch];
ul_ch0_ext = &pusch_vars->ul_ch_estimates_ext[aarx][symbol*nb_re_pusch2];
ul_ch0_ptrs = &pusch_vars->ul_ch_ptrs_estimates[aarx][pusch_vars->ptrs_symbol_index*frame_parms->ofdm_symbol_size]; // update channel estimates if new dmrs symbol are available
......@@ -366,10 +372,16 @@ void nr_ulsch_scale_channel(int **ul_ch_estimates_ext,
ch_amp128 = _mm_set1_epi16(ch_amp); // Q3.13
#ifdef __AVX2__
int off = ((nb_rb&1) == 1)? 4:0;
#else
int off = 0;
#endif
for (aatx=0; aatx < frame_parms->nb_antenna_ports_gNB; aatx++) {
for (aarx=0; aarx < frame_parms->nb_antennas_rx; aarx++) {
ul_ch128 = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*nb_rb*NR_NB_SC_PER_RB];
ul_ch128 = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*(off+(nb_rb*NR_NB_SC_PER_RB))];
if (is_dmrs_symbol==1){
if (pusch_dmrs_type == pusch_dmrs_type1)
......@@ -418,12 +430,18 @@ void nr_ulsch_channel_level(int **ul_ch_estimates_ext,
int16_t x = factor2(len);
int16_t y = (len)>>x;
#ifdef __AVX2__
int off = ((nb_rb&1) == 1)? 4:0;
#else
int off = 0;
#endif
for (aatx = 0; aatx < frame_parms->nb_antennas_tx; aatx++)
for (aarx = 0; aarx < frame_parms->nb_antennas_rx; aarx++) {
//clear average level
avg128U = _mm_setzero_si128();
ul_ch128=(__m128i *)&ul_ch_estimates_ext[(aatx<<1)+aarx][symbol*nb_rb*12];
ul_ch128=(__m128i *)&ul_ch_estimates_ext[(aatx<<1)+aarx][symbol*(off+(nb_rb*12))];
for (rb = 0; rb < len/12; rb++) {
avg128U = _mm_add_epi32(avg128U, _mm_srai_epi32(_mm_madd_epi16(ul_ch128[0], ul_ch128[0]), x));
......@@ -511,13 +529,13 @@ void nr_ulsch_channel_compensation(int **rxdataF_ext,
unsigned short nb_rb,
unsigned char output_shift) {
#ifdef DEBUG_CH_COMP
int16_t *rxF, *ul_ch;
int prnt_idx;
rxF = (int16_t *)&rxdataF_ext[0][(symbol*nb_rb*12)];
ul_ch = (int16_t *)&ul_ch_estimates_ext[0][symbol*nb_rb*12];
rxF = (int16_t *)&rxdataF_ext[0][symbol*(off+(nb_rb*12))];
ul_ch = (int16_t *)&ul_ch_estimates_ext[0][symbol*(off+(nb_rb*1))2];
printf("--------------------symbol = %d, mod_order = %d, output_shift = %d-----------------------\n", symbol, mod_order, output_shift);
printf("----------------Before compensation------------------\n");
......@@ -536,7 +554,7 @@ void nr_ulsch_channel_compensation(int **rxdataF_ext,
int print_idx;
ch_mag = (int16_t *)&ul_ch_mag[0][(symbol*nb_rb*12)];
ch_mag = (int16_t *)&ul_ch_mag[0][symbol*(off+(nb_rb*12))];
printf("--------------------symbol = %d, mod_order = %d-----------------------\n", symbol, mod_order);
printf("----------------Before computation------------------\n");
......@@ -549,7 +567,13 @@ void nr_ulsch_channel_compensation(int **rxdataF_ext,
#endif
#if defined(__i386) || defined(__x86_64)
#ifdef __AVX2__
int off = ((nb_rb&1) == 1)? 4:0;
#else
int off = 0;
#endif
#if defined(__i386) || defined(__x86_64__)
unsigned short rb;
unsigned char aatx,aarx;
......@@ -571,11 +595,11 @@ void nr_ulsch_channel_compensation(int **rxdataF_ext,
for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {
ul_ch128 = (__m128i *)&ul_ch_estimates_ext[(aatx<<1)+aarx][symbol*nb_rb*12];
ul_ch_mag128 = (__m128i *)&ul_ch_mag[(aatx<<1)+aarx][symbol*nb_rb*12];
ul_ch_mag128b = (__m128i *)&ul_ch_magb[(aatx<<1)+aarx][symbol*nb_rb*12];
rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*nb_rb*12];
rxdataF_comp128 = (__m128i *)&rxdataF_comp[(aatx<<1)+aarx][symbol*nb_rb*12];
ul_ch128 = (__m128i *)&ul_ch_estimates_ext[(aatx<<1)+aarx][symbol*(off+(nb_rb*12))];
ul_ch_mag128 = (__m128i *)&ul_ch_mag[(aatx<<1)+aarx][symbol*(off+(nb_rb*12))];
ul_ch_mag128b = (__m128i *)&ul_ch_magb[(aatx<<1)+aarx][symbol*(off+(nb_rb*12))];
rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*(off+(nb_rb*12))];
rxdataF_comp128 = (__m128i *)&rxdataF_comp[(aatx<<1)+aarx][symbol*(off+(nb_rb*12))];
for (rb=0; rb<nb_rb; rb++) {
......@@ -991,7 +1015,7 @@ void nr_ulsch_channel_compensation(int **rxdataF_ext,
#ifdef DEBUG_CH_COMP
rxF = (int16_t *)&rxdataF_comp[0][(symbol*nb_rb*12)];
rxF = (int16_t *)&rxdataF_comp[0][(symbol*(off+(nb_rb*12)))];
printf("----------------After compansation------------------\n");
......@@ -1006,7 +1030,7 @@ void nr_ulsch_channel_compensation(int **rxdataF_ext,
#ifdef DEBUG_CH_MAG
ch_mag = (int16_t *)&ul_ch_mag[0][(symbol*nb_rb*12)];
ch_mag = (int16_t *)&ul_ch_mag[0][(symbol*(off+(nb_rb*12)))];
printf("----------------After computation------------------\n");
......@@ -1178,7 +1202,12 @@ int nr_rx_pusch(PHY_VARS_gNB *gNB,
//----------------------------------------------------------
start_meas(&gNB->ulsch_llr_stats);
AssertFatal(gNB->pusch_vars[ulsch_id]->rxdataF_ext_offset * rel15_ul->qam_mod_order+nb_re_pusch*rel15_ul->qam_mod_order < (8*((3*8*6144)+12)) , "Mysterious llr buffer size check");
nr_ulsch_compute_llr(&gNB->pusch_vars[ulsch_id]->rxdataF_comp[0][symbol * rel15_ul->rb_size * NR_NB_SC_PER_RB],
#ifdef __AVX2__
int off = ((rel15_ul->rb_size&1) == 1)? 4:0;
#else
int off = 0;
#endif
nr_ulsch_compute_llr(&gNB->pusch_vars[ulsch_id]->rxdataF_comp[0][symbol * (off+(rel15_ul->rb_size * NR_NB_SC_PER_RB))],
gNB->pusch_vars[ulsch_id]->ul_ch_mag0,
gNB->pusch_vars[ulsch_id]->ul_ch_magb0,
&gNB->pusch_vars[ulsch_id]->llr[gNB->pusch_vars[ulsch_id]->rxdataF_ext_offset * rel15_ul->qam_mod_order],
......
......@@ -34,9 +34,6 @@
#include "PHY/sse_intrin.h"
#include "PHY/impl_defs_top.h"
__m128i xmm0 __attribute__ ((aligned(32)));
__m128i xmm1 __attribute__ ((aligned(32)));
__m128i xmm2 __attribute__ ((aligned(32)));
//----------------------------------------------------------------------------------------------
......@@ -47,20 +44,19 @@ void nr_ulsch_qpsk_llr(int32_t *rxdataF_comp,
uint32_t nb_re,
uint8_t symbol)
{
int i;
uint32_t *rxF = (uint32_t*)rxdataF_comp;
uint32_t *llr32 = (uint32_t*)ulsch_llr;
if (!llr32) {
LOG_E(PHY,"nr_ulsch_qpsk_llr: llr is null, symbol %d, llr32 = %p\n",symbol, llr32);
}
/*
for (i = 0; i < nb_re; i++) {
*llr32 = *rxF;
rxF++;
llr32++;
}
}*/
memcpy1((void*)llr32,(void*)rxF,nb_re<<2);
}
//----------------------------------------------------------------------------------------------
......@@ -76,9 +72,17 @@ void nr_ulsch_16qam_llr(int32_t *rxdataF_comp,
{
#if defined(__x86_64__) || defined(__i386__)
#ifdef __AVX2__
__m256i *rxF = (__m256i*)rxdataF_comp;
__m256i *ch_mag;
__m256i llr256[2];
register __m256i xmm0;
#else
__m128i *rxF = (__m128i*)rxdataF_comp;
__m128i *ch_mag;
__m128i llr128[2];
register __m128i xmm0;
#endif
uint32_t *llr32;
#elif defined(__arm__)
......@@ -90,7 +94,12 @@ void nr_ulsch_16qam_llr(int32_t *rxdataF_comp,
int i;
unsigned char len_mod4 = 0;
#ifdef __AVX2__
int off = ((nb_rb&1) == 1)? 4:0;
#else
int off = 0;
#endif
#if defined(__x86_64__) || defined(__i386__)
......@@ -100,22 +109,69 @@ void nr_ulsch_16qam_llr(int32_t *rxdataF_comp,
#endif
#if defined(__x86_64__) || defined(__i386__)
ch_mag = (__m128i*)&ul_ch_mag[0][(symbol*nb_rb*12)];
#ifdef __AVX2__
ch_mag = (__m256i*)&ul_ch_mag[0][(symbol*(off+(nb_rb*12)))];
#else
ch_mag = (__m128i*)&ul_ch_mag[0][(symbol*(off+(nb_rb*12)))];
#endif
#elif defined(__arm__)
ch_mag = (int16x8_t*)&ul_ch_mag[0][(symbol*nb_rb*12)];
#endif
len_mod4 = nb_re&3;
#ifdef __AVX2__
unsigned char len_mod8 = nb_re&7;
nb_re >>= 3; // length in groups of 8 REs (one __m256i each)
nb_re += (len_mod8 == 0 ? 0 : 1);
#else
unsigned char len_mod4 = nb_re&3;
nb_re >>= 2; // length in quad words (4 REs)
nb_re += (len_mod4 == 0 ? 0 : 1);
#endif
for (i=0; i<nb_re; i++) {
#if defined(__x86_64__) || defined(__i386)
#ifdef __AVX2__
xmm0 = _mm256_abs_epi16(rxF[i]); // registers of even index in xmm0-> |y_R|, registers of odd index in xmm0-> |y_I|
xmm0 = _mm256_subs_epi16(ch_mag[i],xmm0); // registers of even index in xmm0-> |y_R|-|h|^2, registers of odd index in xmm0-> |y_I|-|h|^2
llr256[0] = _mm256_unpacklo_epi32(rxF[i],xmm0); // llr256[0] contains the llrs of the 1st,2nd,5th and 6th REs
llr256[1] = _mm256_unpackhi_epi32(rxF[i],xmm0); // llr256[1] contains the llrs of the 3rd, 4th, 7th and 8th REs
// 1st RE
llr32[0] = _mm256_extract_epi32(llr256[0],0); // llr32[0] low 16 bits-> y_R , high 16 bits-> y_I
llr32[1] = _mm256_extract_epi32(llr256[0],1); // llr32[1] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 2nd RE
llr32[2] = _mm256_extract_epi32(llr256[0],2); // llr32[2] low 16 bits-> y_R , high 16 bits-> y_I
llr32[3] = _mm256_extract_epi32(llr256[0],3); // llr32[3] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 3rd RE
llr32[4] = _mm256_extract_epi32(llr256[1],0); // llr32[4] low 16 bits-> y_R , high 16 bits-> y_I
llr32[5] = _mm256_extract_epi32(llr256[1],1); // llr32[5] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 4th RE
llr32[6] = _mm256_extract_epi32(llr256[1],2); // llr32[6] low 16 bits-> y_R , high 16 bits-> y_I
llr32[7] = _mm256_extract_epi32(llr256[1],3); // llr32[7] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 5th RE
llr32[8] = _mm256_extract_epi32(llr256[0],4); // llr32[8] low 16 bits-> y_R , high 16 bits-> y_I
llr32[9] = _mm256_extract_epi32(llr256[0],5); // llr32[9] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 6th RE
llr32[10] = _mm256_extract_epi32(llr256[0],6); // llr32[10] low 16 bits-> y_R , high 16 bits-> y_I
llr32[11] = _mm256_extract_epi32(llr256[0],7); // llr32[11] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 7th RE
llr32[12] = _mm256_extract_epi32(llr256[1],4); // llr32[12] low 16 bits-> y_R , high 16 bits-> y_I
llr32[13] = _mm256_extract_epi32(llr256[1],5); // llr32[13] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 8th RE
llr32[14] = _mm256_extract_epi32(llr256[1],6); // llr32[14] low 16 bits-> y_R , high 16 bits-> y_I
llr32[15] = _mm256_extract_epi32(llr256[1],7); // llr32[15] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
llr32+=16;
#else
xmm0 = _mm_abs_epi16(rxF[i]); // registers of even index in xmm0-> |y_R|, registers of odd index in xmm0-> |y_I|
xmm0 = _mm_subs_epi16(ch_mag[i],xmm0); // registers of even index in xmm0-> |y_R|-|h|^2, registers of odd index in xmm0-> |y_I|-|h|^2
llr128[0] = _mm_unpacklo_epi32(rxF[i],xmm0); // llr128[0] contains the llrs of the 1st and 2nd REs
......@@ -138,6 +194,7 @@ void nr_ulsch_16qam_llr(int32_t *rxdataF_comp,
llr32[7] = _mm_extract_epi32(llr128[1],3); // llr32[7] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
llr32+=8;
#endif
#elif defined(__arm__)
xmm0 = vabsq_s16(rxF[i]);
xmm0 = vqsubq_s16((*(__m128i*)&ones[0]),xmm0);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment