Commit cb10843e authored by Raymond Knopp's avatar Raymond Knopp

added AVX2 optimization for 64QAM LLR computation in NR ULSCH

parent 45ed81be
......@@ -239,10 +239,22 @@ void nr_ulsch_64qam_llr(int32_t *rxdataF_comp,
uint8_t symbol)
{
#ifdef __AVX2__
int off = ((nb_rb&1) == 1)? 4:0;
#else
int off = 0;
#endif
#if defined(__x86_64__) || defined(__i386__)
#ifdef __AVX2__
__m256i *rxF = (__m256i*)rxdataF_comp;
__m256i *ch_mag,*ch_magb;
register __m256i xmm0,xmm1,xmm2;
#else
__m128i *rxF = (__m128i*)rxdataF_comp;
__m128i *ch_mag,*ch_magb;
register __m128i xmm0,xmm1,xmm2;
#endif
#elif defined(__arm__)
int16x8_t *rxF = (int16x8_t*)&rxdataF_comp;
int16x8_t *ch_mag,*ch_magb; // [hna] This should be uncommented once channel estimation is implemented
......@@ -250,28 +262,44 @@ void nr_ulsch_64qam_llr(int32_t *rxdataF_comp,
#endif
int i;
unsigned char len_mod4;
#if defined(__x86_64__) || defined(__i386__)
#ifdef __AVX2__
ch_mag = (__m256i*)&ul_ch_mag[0][(symbol*(off+(nb_rb*12)))];
ch_magb = (__m256i*)&ul_ch_magb[0][(symbol*(off+(nb_rb*12)))];
#else
ch_mag = (__m128i*)&ul_ch_mag[0][(symbol*nb_rb*12)];
ch_magb = (__m128i*)&ul_ch_magb[0][(symbol*nb_rb*12)];
#endif
#elif defined(__arm__)
ch_mag = (int16x8_t*)&ul_ch_mag[0][(symbol*nb_rb*12)];
ch_magb = (int16x8_t*)&ul_ch_magb[0][(symbol*nb_rb*12)];
#endif
len_mod4 = nb_re&3;
#ifdef __AVX2__
int len_mod8 = nb_re&7;
nb_re = nb_re>>3; // length in quad words (4 REs)
nb_re += ((len_mod8 == 0) ? 0 : 1);
#else
int len_mod4 = nb_re&3;
nb_re = nb_re>>2; // length in quad words (4 REs)
nb_re += ((len_mod4 == 0) ? 0 : 1);
#endif
for (i=0; i<nb_re; i++) {
xmm0 = rxF[i];
#if defined(__x86_64__) || defined(__i386__)
#ifdef __AVX2__
xmm1 = _mm256_abs_epi16(xmm0);
xmm1 = _mm256_subs_epi16(ch_mag[i],xmm1);
xmm2 = _mm256_abs_epi16(xmm1);
xmm2 = _mm256_subs_epi16(ch_magb[i],xmm2);
#else
xmm1 = _mm_abs_epi16(xmm0);
xmm1 = _mm_subs_epi16(ch_mag[i],xmm1);
xmm2 = _mm_abs_epi16(xmm1);
xmm2 = _mm_subs_epi16(ch_magb[i],xmm2);
#endif
#elif defined(__arm__)
xmm1 = vabsq_s16(xmm0);
xmm1 = vsubq_s16(ch_mag[i],xmm1);
......@@ -283,12 +311,21 @@ void nr_ulsch_64qam_llr(int32_t *rxdataF_comp,
// 1st RE
// ---------------------------------------
#if defined(__x86_64__) || defined(__i386__)
#ifdef __AVX2__
ulsch_llr[0] = _mm256_extract_epi16(xmm0,0);
ulsch_llr[1] = _mm256_extract_epi16(xmm0,1);
ulsch_llr[2] = _mm256_extract_epi16(xmm1,0);
ulsch_llr[3] = _mm256_extract_epi16(xmm1,1);
ulsch_llr[4] = _mm256_extract_epi16(xmm2,0);
ulsch_llr[5] = _mm256_extract_epi16(xmm2,1);
#else
ulsch_llr[0] = _mm_extract_epi16(xmm0,0);
ulsch_llr[1] = _mm_extract_epi16(xmm0,1);
ulsch_llr[2] = _mm_extract_epi16(xmm1,0);
ulsch_llr[3] = _mm_extract_epi16(xmm1,1);
ulsch_llr[4] = _mm_extract_epi16(xmm2,0);
ulsch_llr[5] = _mm_extract_epi16(xmm2,1);
#endif
#elif defined(__arm__)
ulsch_llr[0] = vgetq_lane_s16(xmm0,0);
ulsch_llr[1] = vgetq_lane_s16(xmm0,1);
......@@ -305,12 +342,21 @@ void nr_ulsch_64qam_llr(int32_t *rxdataF_comp,
// 2nd RE
// ---------------------------------------
#if defined(__x86_64__) || defined(__i386__)
#ifdef __AVX2__
ulsch_llr[0] = _mm256_extract_epi16(xmm0,2);
ulsch_llr[1] = _mm256_extract_epi16(xmm0,3);
ulsch_llr[2] = _mm256_extract_epi16(xmm1,2);
ulsch_llr[3] = _mm256_extract_epi16(xmm1,3);
ulsch_llr[4] = _mm256_extract_epi16(xmm2,2);
ulsch_llr[5] = _mm256_extract_epi16(xmm2,3);
#else
ulsch_llr[0] = _mm_extract_epi16(xmm0,2);
ulsch_llr[1] = _mm_extract_epi16(xmm0,3);
ulsch_llr[2] = _mm_extract_epi16(xmm1,2);
ulsch_llr[3] = _mm_extract_epi16(xmm1,3);
ulsch_llr[4] = _mm_extract_epi16(xmm2,2);
ulsch_llr[5] = _mm_extract_epi16(xmm2,3);
#endif
#elif defined(__arm__)
ulsch_llr[2] = vgetq_lane_s16(xmm0,2);
ulsch_llr[3] = vgetq_lane_s16(xmm0,3);
......@@ -327,12 +373,21 @@ void nr_ulsch_64qam_llr(int32_t *rxdataF_comp,
// 3rd RE
// ---------------------------------------
#if defined(__x86_64__) || defined(__i386__)
#ifdef __AVX2__
ulsch_llr[0] = _mm256_extract_epi16(xmm0,4);
ulsch_llr[1] = _mm256_extract_epi16(xmm0,5);
ulsch_llr[2] = _mm256_extract_epi16(xmm1,4);
ulsch_llr[3] = _mm256_extract_epi16(xmm1,5);
ulsch_llr[4] = _mm256_extract_epi16(xmm2,4);
ulsch_llr[5] = _mm256_extract_epi16(xmm2,5);
#else
ulsch_llr[0] = _mm_extract_epi16(xmm0,4);
ulsch_llr[1] = _mm_extract_epi16(xmm0,5);
ulsch_llr[2] = _mm_extract_epi16(xmm1,4);
ulsch_llr[3] = _mm_extract_epi16(xmm1,5);
ulsch_llr[4] = _mm_extract_epi16(xmm2,4);
ulsch_llr[5] = _mm_extract_epi16(xmm2,5);
#endif
#elif defined(__arm__)
ulsch_llr[0] = vgetq_lane_s16(xmm0,4);
ulsch_llr[1] = vgetq_lane_s16(xmm0,5);
......@@ -349,12 +404,21 @@ void nr_ulsch_64qam_llr(int32_t *rxdataF_comp,
// 4th RE
// ---------------------------------------
#if defined(__x86_64__) || defined(__i386__)
#ifdef __AVX2__
ulsch_llr[0] = _mm256_extract_epi16(xmm0,6);
ulsch_llr[1] = _mm256_extract_epi16(xmm0,7);
ulsch_llr[2] = _mm256_extract_epi16(xmm1,6);
ulsch_llr[3] = _mm256_extract_epi16(xmm1,7);
ulsch_llr[4] = _mm256_extract_epi16(xmm2,6);
ulsch_llr[5] = _mm256_extract_epi16(xmm2,7);
#else
ulsch_llr[0] = _mm_extract_epi16(xmm0,6);
ulsch_llr[1] = _mm_extract_epi16(xmm0,7);
ulsch_llr[2] = _mm_extract_epi16(xmm1,6);
ulsch_llr[3] = _mm_extract_epi16(xmm1,7);
ulsch_llr[4] = _mm_extract_epi16(xmm2,6);
ulsch_llr[5] = _mm_extract_epi16(xmm2,7);
#endif
#elif defined(__arm__)
ulsch_llr[0] = vgetq_lane_s16(xmm0,6);
ulsch_llr[1] = vgetq_lane_s16(xmm0,7);
......@@ -366,6 +430,37 @@ void nr_ulsch_64qam_llr(int32_t *rxdataF_comp,
// ---------------------------------------
ulsch_llr+=6;
#ifdef __AVX2__
ulsch_llr[0] = _mm256_extract_epi16(xmm0,8);
ulsch_llr[1] = _mm256_extract_epi16(xmm0,9);
ulsch_llr[2] = _mm256_extract_epi16(xmm1,8);
ulsch_llr[3] = _mm256_extract_epi16(xmm1,9);
ulsch_llr[4] = _mm256_extract_epi16(xmm2,8);
ulsch_llr[5] = _mm256_extract_epi16(xmm2,9);
ulsch_llr[6] = _mm256_extract_epi16(xmm0,10);
ulsch_llr[7] = _mm256_extract_epi16(xmm0,11);
ulsch_llr[8] = _mm256_extract_epi16(xmm1,10);
ulsch_llr[9] = _mm256_extract_epi16(xmm1,11);
ulsch_llr[10] = _mm256_extract_epi16(xmm2,10);
ulsch_llr[11] = _mm256_extract_epi16(xmm2,11);
ulsch_llr[12] = _mm256_extract_epi16(xmm0,12);
ulsch_llr[13] = _mm256_extract_epi16(xmm0,13);
ulsch_llr[14] = _mm256_extract_epi16(xmm1,12);
ulsch_llr[15] = _mm256_extract_epi16(xmm1,13);
ulsch_llr[16] = _mm256_extract_epi16(xmm2,12);
ulsch_llr[17] = _mm256_extract_epi16(xmm2,13);
ulsch_llr[18] = _mm256_extract_epi16(xmm0,14);
ulsch_llr[19] = _mm256_extract_epi16(xmm0,15);
ulsch_llr[20] = _mm256_extract_epi16(xmm1,14);
ulsch_llr[21] = _mm256_extract_epi16(xmm1,15);
ulsch_llr[22] = _mm256_extract_epi16(xmm2,14);
ulsch_llr[23] = _mm256_extract_epi16(xmm2,15);
ulsch_llr+=24;
#endif
}
#if defined(__x86_64__) || defined(__i386__)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment