Commit 37c1219e authored by Raymond Knopp's avatar Raymond Knopp

AVX2 optimization of 16QAM NR ULSCH LLR

parent fc21364f
......@@ -257,16 +257,22 @@ void nr_ulsch_extract_rbs_single(int32_t **rxdataF,
uint8_t is_dmrs_re=0,is_ptrs_re=0;
start_re = (frame_parms->first_carrier_offset + (pusch_pdu->rb_start * NR_NB_SC_PER_RB))%frame_parms->ofdm_symbol_size;
nb_re_pusch = NR_NB_SC_PER_RB * pusch_pdu->rb_size;
#ifdef __AVX2__
int nb_re_pusch2 = nb_re_pusch + (nb_re_pusch&7);
#else
int nb_re_pusch2 = nb_re_pusch;
#endif
num_ptrs_symbols = 0;
for (aarx = 0; aarx < frame_parms->nb_antennas_rx; aarx++) {
rxF = (int16_t *)&rxdataF[aarx][symbol * frame_parms->ofdm_symbol_size];
rxF_ext = (int16_t *)&pusch_vars->rxdataF_ext[aarx][symbol * nb_re_pusch]; // [hna] rxdataF_ext isn't contiguous in order to solve an alignment problem ib llr computation in case of mod_order = 4, 6
rxF_ext = (int16_t *)&pusch_vars->rxdataF_ext[aarx][symbol * nb_re_pusch2]; // [hna] rxdataF_ext isn't contiguous in order to solve an alignment problem ib llr computation in case of mod_order = 4, 6
ul_ch0 = &pusch_vars->ul_ch_estimates[aarx][pusch_vars->dmrs_symbol*frame_parms->ofdm_symbol_size]; // update channel estimates if new dmrs symbol are available
ul_ch0_ext = &pusch_vars->ul_ch_estimates_ext[aarx][symbol*nb_re_pusch];
ul_ch0_ext = &pusch_vars->ul_ch_estimates_ext[aarx][symbol*nb_re_pusch2];
ul_ch0_ptrs = &pusch_vars->ul_ch_ptrs_estimates[aarx][pusch_vars->ptrs_symbol_index*frame_parms->ofdm_symbol_size]; // update channel estimates if new dmrs symbol are available
......@@ -366,10 +372,16 @@ void nr_ulsch_scale_channel(int **ul_ch_estimates_ext,
ch_amp128 = _mm_set1_epi16(ch_amp); // Q3.13
#ifdef __AVX2__
int off = ((nb_rb&1) == 1)? 4:0;
#else
int off = 0;
#endif
for (aatx=0; aatx < frame_parms->nb_antenna_ports_gNB; aatx++) {
for (aarx=0; aarx < frame_parms->nb_antennas_rx; aarx++) {
ul_ch128 = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*nb_rb*NR_NB_SC_PER_RB];
ul_ch128 = (__m128i *)&ul_ch_estimates_ext[aarx][symbol*(off+(nb_rb*NR_NB_SC_PER_RB))];
if (is_dmrs_symbol==1){
if (pusch_dmrs_type == pusch_dmrs_type1)
......@@ -418,12 +430,18 @@ void nr_ulsch_channel_level(int **ul_ch_estimates_ext,
int16_t x = factor2(len);
int16_t y = (len)>>x;
#ifdef __AVX2__
int off = ((nb_rb&1) == 1)? 4:0;
#else
int off = 0;
#endif
for (aatx = 0; aatx < frame_parms->nb_antennas_tx; aatx++)
for (aarx = 0; aarx < frame_parms->nb_antennas_rx; aarx++) {
//clear average level
avg128U = _mm_setzero_si128();
ul_ch128=(__m128i *)&ul_ch_estimates_ext[(aatx<<1)+aarx][symbol*nb_rb*12];
ul_ch128=(__m128i *)&ul_ch_estimates_ext[(aatx<<1)+aarx][symbol*(off+(nb_rb*12))];
for (rb = 0; rb < len/12; rb++) {
avg128U = _mm_add_epi32(avg128U, _mm_srai_epi32(_mm_madd_epi16(ul_ch128[0], ul_ch128[0]), x));
......@@ -511,13 +529,13 @@ void nr_ulsch_channel_compensation(int **rxdataF_ext,
unsigned short nb_rb,
unsigned char output_shift) {
#ifdef DEBUG_CH_COMP
int16_t *rxF, *ul_ch;
int prnt_idx;
rxF = (int16_t *)&rxdataF_ext[0][(symbol*nb_rb*12)];
ul_ch = (int16_t *)&ul_ch_estimates_ext[0][symbol*nb_rb*12];
rxF = (int16_t *)&rxdataF_ext[0][symbol*(off+(nb_rb*12))];
ul_ch = (int16_t *)&ul_ch_estimates_ext[0][symbol*(off+(nb_rb*1))2];
printf("--------------------symbol = %d, mod_order = %d, output_shift = %d-----------------------\n", symbol, mod_order, output_shift);
printf("----------------Before compensation------------------\n");
......@@ -536,7 +554,7 @@ void nr_ulsch_channel_compensation(int **rxdataF_ext,
int print_idx;
ch_mag = (int16_t *)&ul_ch_mag[0][(symbol*nb_rb*12)];
ch_mag = (int16_t *)&ul_ch_mag[0][symbol*(off+(nb_rb*12))];
printf("--------------------symbol = %d, mod_order = %d-----------------------\n", symbol, mod_order);
printf("----------------Before computation------------------\n");
......@@ -549,7 +567,13 @@ void nr_ulsch_channel_compensation(int **rxdataF_ext,
#endif
#if defined(__i386) || defined(__x86_64)
#ifdef __AVX2__
int off = ((nb_rb&1) == 1)? 4:0;
#else
int off = 0;
#endif
#if defined(__i386) || defined(__x86_64__)
unsigned short rb;
unsigned char aatx,aarx;
......@@ -571,11 +595,11 @@ void nr_ulsch_channel_compensation(int **rxdataF_ext,
for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {
ul_ch128 = (__m128i *)&ul_ch_estimates_ext[(aatx<<1)+aarx][symbol*nb_rb*12];
ul_ch_mag128 = (__m128i *)&ul_ch_mag[(aatx<<1)+aarx][symbol*nb_rb*12];
ul_ch_mag128b = (__m128i *)&ul_ch_magb[(aatx<<1)+aarx][symbol*nb_rb*12];
rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*nb_rb*12];
rxdataF_comp128 = (__m128i *)&rxdataF_comp[(aatx<<1)+aarx][symbol*nb_rb*12];
ul_ch128 = (__m128i *)&ul_ch_estimates_ext[(aatx<<1)+aarx][symbol*(off+(nb_rb*12))];
ul_ch_mag128 = (__m128i *)&ul_ch_mag[(aatx<<1)+aarx][symbol*(off+(nb_rb*12))];
ul_ch_mag128b = (__m128i *)&ul_ch_magb[(aatx<<1)+aarx][symbol*(off+(nb_rb*12))];
rxdataF128 = (__m128i *)&rxdataF_ext[aarx][symbol*(off+(nb_rb*12))];
rxdataF_comp128 = (__m128i *)&rxdataF_comp[(aatx<<1)+aarx][symbol*(off+(nb_rb*12))];
for (rb=0; rb<nb_rb; rb++) {
......@@ -991,7 +1015,7 @@ void nr_ulsch_channel_compensation(int **rxdataF_ext,
#ifdef DEBUG_CH_COMP
rxF = (int16_t *)&rxdataF_comp[0][(symbol*nb_rb*12)];
rxF = (int16_t *)&rxdataF_comp[0][(symbol*(off+(nb_rb*12)))];
printf("----------------After compansation------------------\n");
......@@ -1006,7 +1030,7 @@ void nr_ulsch_channel_compensation(int **rxdataF_ext,
#ifdef DEBUG_CH_MAG
ch_mag = (int16_t *)&ul_ch_mag[0][(symbol*nb_rb*12)];
ch_mag = (int16_t *)&ul_ch_mag[0][(symbol*(off+(nb_rb*12)))];
printf("----------------After computation------------------\n");
......@@ -1178,7 +1202,12 @@ int nr_rx_pusch(PHY_VARS_gNB *gNB,
//----------------------------------------------------------
start_meas(&gNB->ulsch_llr_stats);
AssertFatal(gNB->pusch_vars[ulsch_id]->rxdataF_ext_offset * rel15_ul->qam_mod_order+nb_re_pusch*rel15_ul->qam_mod_order < (8*((3*8*6144)+12)) , "Mysterious llr buffer size check");
nr_ulsch_compute_llr(&gNB->pusch_vars[ulsch_id]->rxdataF_comp[0][symbol * rel15_ul->rb_size * NR_NB_SC_PER_RB],
#ifdef __AVX2__
int off = ((rel15_ul->rb_size&1) == 1)? 4:0;
#else
int off = 0;
#endif
nr_ulsch_compute_llr(&gNB->pusch_vars[ulsch_id]->rxdataF_comp[0][symbol * (off+(rel15_ul->rb_size * NR_NB_SC_PER_RB))],
gNB->pusch_vars[ulsch_id]->ul_ch_mag0,
gNB->pusch_vars[ulsch_id]->ul_ch_magb0,
&gNB->pusch_vars[ulsch_id]->llr[gNB->pusch_vars[ulsch_id]->rxdataF_ext_offset * rel15_ul->qam_mod_order],
......
......@@ -34,9 +34,6 @@
#include "PHY/sse_intrin.h"
#include "PHY/impl_defs_top.h"
__m128i xmm0 __attribute__ ((aligned(32)));
__m128i xmm1 __attribute__ ((aligned(32)));
__m128i xmm2 __attribute__ ((aligned(32)));
//----------------------------------------------------------------------------------------------
......@@ -47,20 +44,19 @@ void nr_ulsch_qpsk_llr(int32_t *rxdataF_comp,
uint32_t nb_re,
uint8_t symbol)
{
int i;
uint32_t *rxF = (uint32_t*)rxdataF_comp;
uint32_t *llr32 = (uint32_t*)ulsch_llr;
if (!llr32) {
LOG_E(PHY,"nr_ulsch_qpsk_llr: llr is null, symbol %d, llr32 = %p\n",symbol, llr32);
}
/*
for (i = 0; i < nb_re; i++) {
*llr32 = *rxF;
rxF++;
llr32++;
}
}*/
memcpy1((void*)llr32,(void*)rxF,nb_re<<2);
}
//----------------------------------------------------------------------------------------------
......@@ -76,9 +72,17 @@ void nr_ulsch_16qam_llr(int32_t *rxdataF_comp,
{
#if defined(__x86_64__) || defined(__i386__)
#ifdef __AVX2__
__m256i *rxF = (__m256i*)rxdataF_comp;
__m256i *ch_mag;
__m256i llr256[2];
register __m256i xmm0;
#else
__m128i *rxF = (__m128i*)rxdataF_comp;
__m128i *ch_mag;
__m128i llr128[2];
register __m128i xmm0;
#endif
uint32_t *llr32;
#elif defined(__arm__)
......@@ -90,7 +94,12 @@ void nr_ulsch_16qam_llr(int32_t *rxdataF_comp,
int i;
unsigned char len_mod4 = 0;
#ifdef __AVX2__
int off = ((nb_rb&1) == 1)? 4:0;
#else
int off = 0;
#endif
#if defined(__x86_64__) || defined(__i386__)
......@@ -100,22 +109,69 @@ void nr_ulsch_16qam_llr(int32_t *rxdataF_comp,
#endif
#if defined(__x86_64__) || defined(__i386__)
ch_mag = (__m128i*)&ul_ch_mag[0][(symbol*nb_rb*12)];
#ifdef __AVX2__
ch_mag = (__m256i*)&ul_ch_mag[0][(symbol*(off+(nb_rb*12)))];
#else
ch_mag = (__m128i*)&ul_ch_mag[0][(symbol*(off+(nb_rb*12)))];
#endif
#elif defined(__arm__)
ch_mag = (int16x8_t*)&ul_ch_mag[0][(symbol*nb_rb*12)];
#endif
len_mod4 = nb_re&3;
#ifdef __AVX2__
unsigned char len_mod8 = nb_re&7;
nb_re >>= 3; // length in quad words (4 REs)
nb_re += (len_mod8 == 0 ? 0 : 1);
#else
unsigned char len_mod4 = nb_re&3;
nb_re >>= 2; // length in quad words (4 REs)
nb_re += (len_mod4 == 0 ? 0 : 1);
#endif
for (i=0; i<nb_re; i++) {
#if defined(__x86_64__) || defined(__i386)
#ifdef __AVX2__
xmm0 = _mm256_abs_epi16(rxF[i]); // registers of even index in xmm0-> |y_R|, registers of odd index in xmm0-> |y_I|
xmm0 = _mm256_subs_epi16(ch_mag[i],xmm0); // registers of even index in xmm0-> |y_R|-|h|^2, registers of odd index in xmm0-> |y_I|-|h|^2
llr256[0] = _mm256_unpacklo_epi32(rxF[i],xmm0); // llr128[0] contains the llrs of the 1st,2nd,5th and 6th REs
llr256[1] = _mm256_unpackhi_epi32(rxF[i],xmm0); // llr128[1] contains the llrs of the 3rd, 4th, 7th and 8th REs
// 1st RE
llr32[0] = _mm256_extract_epi32(llr256[0],0); // llr32[0] low 16 bits-> y_R , high 16 bits-> y_I
llr32[1] = _mm256_extract_epi32(llr256[0],1); // llr32[1] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 2nd RE
llr32[2] = _mm256_extract_epi32(llr256[0],2); // llr32[2] low 16 bits-> y_R , high 16 bits-> y_I
llr32[3] = _mm256_extract_epi32(llr256[0],3); // llr32[3] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 3rd RE
llr32[4] = _mm256_extract_epi32(llr256[1],0); // llr32[4] low 16 bits-> y_R , high 16 bits-> y_I
llr32[5] = _mm256_extract_epi32(llr256[1],1); // llr32[5] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 4th RE
llr32[6] = _mm256_extract_epi32(llr256[1],2); // llr32[6] low 16 bits-> y_R , high 16 bits-> y_I
llr32[7] = _mm256_extract_epi32(llr256[1],3); // llr32[7] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 5th RE
llr32[8] = _mm256_extract_epi32(llr256[0],4); // llr32[8] low 16 bits-> y_R , high 16 bits-> y_I
llr32[9] = _mm256_extract_epi32(llr256[0],5); // llr32[9] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 6th RE
llr32[10] = _mm256_extract_epi32(llr256[0],6); // llr32[10] low 16 bits-> y_R , high 16 bits-> y_I
llr32[11] = _mm256_extract_epi32(llr256[0],7); // llr32[11] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 7th RE
llr32[12] = _mm256_extract_epi32(llr256[1],4); // llr32[12] low 16 bits-> y_R , high 16 bits-> y_I
llr32[13] = _mm256_extract_epi32(llr256[1],5); // llr32[13] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
// 8th RE
llr32[14] = _mm256_extract_epi32(llr256[1],6); // llr32[14] low 16 bits-> y_R , high 16 bits-> y_I
llr32[15] = _mm256_extract_epi32(llr256[1],7); // llr32[15] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
llr32+=16;
#else
xmm0 = _mm_abs_epi16(rxF[i]); // registers of even index in xmm0-> |y_R|, registers of odd index in xmm0-> |y_I|
xmm0 = _mm_subs_epi16(ch_mag[i],xmm0); // registers of even index in xmm0-> |y_R|-|h|^2, registers of odd index in xmm0-> |y_I|-|h|^2
llr128[0] = _mm_unpacklo_epi32(rxF[i],xmm0); // llr128[0] contains the llrs of the 1st and 2nd REs
......@@ -138,6 +194,7 @@ void nr_ulsch_16qam_llr(int32_t *rxdataF_comp,
llr32[7] = _mm_extract_epi32(llr128[1],3); // llr32[7] low 16 bits-> |h|-|y_R|^2, high 16 bits-> |h|-|y_I|^2
llr32+=8;
#endif
#elif defined(__arm__)
xmm0 = vabsq_s16(rxF[i]);
xmm0 = vqsubq_s16((*(__m128i*)&ones[0]),xmm0);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment