Commit bbd6f35a authored by Bartosz Podrygajlo

Fix for nr_ulsch_16/64/256qam_llr reading/writing past the buffer size on x86 and ARM.

This fixes out-of-bounds accesses in nr_ulsch_16qam_llr, nr_ulsch_64qam_llr and nr_ulsch_256qam_llr.
For 256QAM this also fixes an incorrect LLR calculation on ARM: the existing AVX code for the 2-RE case
produced results inconsistent with the rest of the code. A test case, check_2_res_256_qam, was added to
visualise the differences; it can later be used to revive AVX acceleration for the 2-RE case.
parent 871078a6
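As a minimal sketch (not part of the commit) of the overrun this fix addresses: the previous loop bounds rounded the RE count up to a whole number of SIMD vectors (e.g. nb_re >>= 2 followed by nb_re += ((nb_re & 3) == 0 ? 0 : 1)), so any RE count that is not a multiple of the vector width made the last iteration read and write past the caller's buffers. The reworked functions process whole vectors only and finish the remainder with a scalar loop built on saturating_sub. The standalone program below only illustrates the bound arithmetic; the names and numbers are hypothetical.

/* Hypothetical illustration of the overrun pattern, not code from the commit. */
#include <stdio.h>

int main(void)
{
  unsigned nb_re = 5;      /* REs actually owned by the caller                */
  unsigned re_per_vec = 4; /* one 128-bit vector holds 4 complex 16-bit REs   */

  /* Old-style bound: round the RE count up to whole vectors.
   * For nb_re = 5 this touches 8 REs worth of memory, i.e. 3 REs past the buffer. */
  unsigned old_iters = nb_re / re_per_vec + ((nb_re % re_per_vec) ? 1 : 0);
  printf("old bound touches %u REs (buffer holds %u)\n", old_iters * re_per_vec, nb_re);

  /* Fixed pattern: whole vectors first, then a scalar loop for the leftover REs. */
  unsigned vec_iters = nb_re / re_per_vec; /* 1 full vector iteration  */
  unsigned tail = nb_re % re_per_vec;      /* 1 RE handled scalar-wise */
  printf("fixed bound: %u vector iterations + %u scalar REs\n", vec_iters, tail);
  return 0;
}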
add_subdirectory(nr_phy_common)
add_subdirectory(TOOLS)
add_subdirectory(NR_TRANSPORT)
if (ENABLE_TESTS)
add_subdirectory(tests)
endif()
@@ -39,6 +39,19 @@
#define USE_128BIT
#endif
int16_t saturating_sub(int16_t a, int16_t b)
{
int32_t result = (int32_t)a - (int32_t)b;
if (result < INT16_MIN) {
return INT16_MIN;
} else if (result > INT16_MAX) {
return INT16_MAX;
} else {
return (int16_t)result;
}
}
//----------------------------------------------------------------------------------------------
// QPSK
//----------------------------------------------------------------------------------------------
@@ -59,48 +72,27 @@ void nr_ulsch_qpsk_llr(int32_t *rxdataF_comp,
// 16-QAM
//----------------------------------------------------------------------------------------------
void nr_ulsch_16qam_llr(int32_t *rxdataF_comp, int32_t *ul_ch_mag, int16_t *ulsch_llr, uint32_t nb_re, uint8_t symbol)
{
simde__m256i *rxF_256 = (simde__m256i *)rxdataF_comp;
simde__m256i *ch_mag = (simde__m256i *)ul_ch_mag;
int64_t *llr_64 = (int64_t *)ulsch_llr;
#ifndef USE_128BIT
simde__m256i xmm0, xmm1, xmm2;
for (int i = 0; i < (nb_re >> 3); i++) {
// registers of even index in xmm0-> |y_R|, registers of odd index in xmm0-> |y_I|
xmm0 = simde_mm256_abs_epi16(*rxF_256);
// registers of even index in xmm0-> |y_R|-|h|^2, registers of odd index in xmm0-> |y_I|-|h|^2
xmm0 = simde_mm256_subs_epi16(*ch_mag, xmm0);
xmm1 = simde_mm256_unpacklo_epi32(*rxF_256, xmm0);
xmm2 = simde_mm256_unpackhi_epi32(*rxF_256, xmm0);
// xmm1 |1st 2ed 3rd 4th 9th 10th 13rd 14th|
// xmm2 |5th 6th 7th 8th 11st 12ed 15th 16th|
*llr_64++ = simde_mm256_extract_epi64(xmm1, 0);
*llr_64++ = simde_mm256_extract_epi64(xmm1, 1);
*llr_64++ = simde_mm256_extract_epi64(xmm2, 0);
@@ -109,9 +101,47 @@ void nr_ulsch_16qam_llr(int32_t *rxdataF_comp,
*llr_64++ = simde_mm256_extract_epi64(xmm1, 3);
*llr_64++ = simde_mm256_extract_epi64(xmm2, 2);
*llr_64++ = simde_mm256_extract_epi64(xmm2, 3);
rxF_256++;
ch_mag++;
}
nb_re &= 0x7;
#endif
simde__m128i *rxF_128 = (simde__m128i *)rxF_256;
simde__m128i *ch_mag_128 = (simde__m128i *)ch_mag;
simde__m128i *ulsch_llr_128 = (simde__m128i *)llr_64;
// Each iteration does 4 RE (gives 16 16bit-llrs)
for (int i = 0; i < (nb_re >> 2); i++) {
// registers of even index in xmm0-> |y_R|, registers of odd index in xmm0-> |y_I|
simde__m128i xmm0 = simde_mm_abs_epi16(*rxF_128);
// registers of even index in xmm0-> |y_R|-|h|^2, registers of odd index in xmm0-> |y_I|-|h|^2
xmm0 = simde_mm_subs_epi16(*ch_mag_128, xmm0);
ulsch_llr_128[0] = simde_mm_unpacklo_epi32(*rxF_128, xmm0); // llr128[0] contains the llrs of the 1st,2nd,5th and 6th REs
ulsch_llr_128[1] = simde_mm_unpackhi_epi32(*rxF_128, xmm0); // llr128[1] contains the llrs of the 3rd, 4th, 7th and 8th REs
ulsch_llr_128 += 2;
rxF_128++;
ch_mag_128++;
}
simde_mm_empty();
nb_re &= 0x3;
int16_t *rxDataF_i16 = (int16_t *)rxF_128;
int16_t *ul_ch_mag_i16 = (int16_t *)ch_mag_128;
int16_t *ulsch_llr_i16 = (int16_t *)ulsch_llr_128;
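// Scalar tail: each remaining RE yields 4 16-bit LLRs: y_R, y_I, mag_R - |y_R|, mag_I - |y_I|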
for (uint i = 0U; i < nb_re; i++) {
int16_t real = rxDataF_i16[2 * i];
int16_t imag = rxDataF_i16[2 * i + 1];
int16_t mag_real = ul_ch_mag_i16[2 * i];
int16_t mag_imag = ul_ch_mag_i16[2 * i + 1];
ulsch_llr_i16[4 * i] = real;
ulsch_llr_i16[4 * i + 1] = imag;
ulsch_llr_i16[4 * i + 2] = saturating_sub(mag_real, abs(real));
ulsch_llr_i16[4 * i + 3] = saturating_sub(mag_imag, abs(imag));
}
}
//----------------------------------------------------------------------------------------------
@@ -121,184 +151,154 @@ void nr_ulsch_16qam_llr(int32_t *rxdataF_comp,
void nr_ulsch_64qam_llr(int32_t *rxdataF_comp,
int32_t *ul_ch_mag,
int32_t *ul_ch_magb,
int16_t *ulsch_llr,
uint32_t nb_re,
uint8_t symbol)
{
simde__m256i *rxF = (simde__m256i *)rxdataF_comp;
simde__m256i *ch_maga = (simde__m256i *)ul_ch_mag;
simde__m256i *ch_magb = (simde__m256i *)ul_ch_magb;
int32_t *llr_32 = (int32_t *)ulsch_llr;
#ifndef USE_128BIT
simde__m256i xmm0, xmm1, xmm2;
for (int i = 0; i < (nb_re >> 3); i++) {
xmm0 = *rxF;
// registers of even index in xmm0-> |y_R|, registers of odd index in xmm0-> |y_I|
xmm1 = simde_mm256_abs_epi16(xmm0);
// registers of even index in xmm0-> |y_R|-|h|^2, registers of odd index in xmm0-> |y_I|-|h|^2
xmm1 = simde_mm256_subs_epi16(*ch_maga, xmm1);
xmm2 = simde_mm256_abs_epi16(xmm1);
xmm2 = simde_mm256_subs_epi16(ch_magb[i], xmm2);
xmm2 = simde_mm256_subs_epi16(*ch_magb, xmm2);
// xmm0 |1st 4th 7th 10th 13th 16th 19th 22ed|
// xmm1 |2ed 5th 8th 11th 14th 17th 20th 23rd|
// xmm2 |3rd 6th 9th 12th 15th 18th 21st 24th|
*llr_32++ = simde_mm256_extract_epi32(xmm0, 0);
*llr_32++ = simde_mm256_extract_epi32(xmm1, 0);
*llr_32++ = simde_mm256_extract_epi32(xmm2, 0);
*llr_32++ = simde_mm256_extract_epi32(xmm0, 1);
*llr_32++ = simde_mm256_extract_epi32(xmm1, 1);
*llr_32++ = simde_mm256_extract_epi32(xmm2, 1);
*llr_32++ = simde_mm256_extract_epi32(xmm0, 2);
*llr_32++ = simde_mm256_extract_epi32(xmm1, 2);
*llr_32++ = simde_mm256_extract_epi32(xmm2, 2);
*llr_32++ = simde_mm256_extract_epi32(xmm0, 3);
*llr_32++ = simde_mm256_extract_epi32(xmm1, 3);
*llr_32++ = simde_mm256_extract_epi32(xmm2, 3);
*llr_32++ = simde_mm256_extract_epi32(xmm0, 4);
*llr_32++ = simde_mm256_extract_epi32(xmm1, 4);
*llr_32++ = simde_mm256_extract_epi32(xmm2, 4);
*llr_32++ = simde_mm256_extract_epi32(xmm0, 5);
*llr_32++ = simde_mm256_extract_epi32(xmm1, 5);
*llr_32++ = simde_mm256_extract_epi32(xmm2, 5);
*llr_32++ = simde_mm256_extract_epi32(xmm0, 6);
*llr_32++ = simde_mm256_extract_epi32(xmm1, 6);
*llr_32++ = simde_mm256_extract_epi32(xmm2, 6);
*llr_32++ = simde_mm256_extract_epi32(xmm0, 7);
*llr_32++ = simde_mm256_extract_epi32(xmm1, 7);
*llr_32++ = simde_mm256_extract_epi32(xmm2, 7);
rxF++;
ch_maga++;
ch_magb++;
}
nb_re &= 0x7;
#endif
simde__m128i *rxF_128 = (simde__m128i *)rxF;
simde__m128i *ch_mag_128 = (simde__m128i *)ch_maga;
simde__m128i *ch_magb_128 = (simde__m128i *)ch_magb;
simde__m64 *llr64 = (simde__m64 *)llr_32;
// Each iteration does 4 RE (gives 24 16bit-llrs)
for (int i = 0; i < (nb_re >> 2); i++) {
simde__m128i xmm0, xmm1, xmm2;
xmm0 = *rxF_128;
xmm1 = simde_mm_abs_epi16(xmm0);
xmm1 = simde_mm_subs_epi16(*ch_mag_128, xmm1);
xmm2 = simde_mm_abs_epi16(xmm1);
xmm2 = simde_mm_subs_epi16(*ch_magb_128, xmm2);
*llr64++ = simde_mm_set_pi32(simde_mm_extract_epi32(xmm1, 0), simde_mm_extract_epi32(xmm0, 0));
*llr64++ = simde_mm_set_pi32(simde_mm_extract_epi32(xmm0, 1), simde_mm_extract_epi32(xmm2, 0));
*llr64++ = simde_mm_set_pi32(simde_mm_extract_epi32(xmm2, 1), simde_mm_extract_epi32(xmm1, 1));
*llr64++ = simde_mm_set_pi32(simde_mm_extract_epi32(xmm1, 2), simde_mm_extract_epi32(xmm0, 2));
*llr64++ = simde_mm_set_pi32(simde_mm_extract_epi32(xmm0, 3), simde_mm_extract_epi32(xmm2, 2));
*llr64++ = simde_mm_set_pi32(simde_mm_extract_epi32(xmm2, 3), simde_mm_extract_epi32(xmm1, 3));
rxF_128++;
ch_mag_128++;
ch_magb_128++;
}
nb_re &= 0x3;
int16_t *rxDataF_i16 = (int16_t *)rxF_128;
int16_t *ul_ch_mag_i16 = (int16_t *)ch_mag_128;
int16_t *ul_ch_magb_i16 = (int16_t *)ch_magb_128;
int16_t *llr_i16 = (int16_t *)llr64;
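// Scalar tail: each remaining RE yields 6 16-bit LLRs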
for (int i = 0; i < nb_re; i++) {
int16_t real = rxDataF_i16[2 * i];
int16_t imag = rxDataF_i16[2 * i + 1];
int16_t mag_real = ul_ch_mag_i16[2 * i];
int16_t mag_imag = ul_ch_mag_i16[2 * i + 1];
llr_i16[6 * i] = real;
llr_i16[6 * i + 1] = imag;
llr_i16[6 * i + 2] = saturating_sub(mag_real, abs(real));
llr_i16[6 * i + 3] = saturating_sub(mag_imag, abs(imag));
int16_t mag_realb = ul_ch_magb_i16[2 * i];
int16_t mag_imagb = ul_ch_magb_i16[2 * i + 1];
llr_i16[6 * i + 4] = saturating_sub(mag_realb, abs(llr_i16[6 * i + 2]));
llr_i16[6 * i + 5] = saturating_sub(mag_imagb, abs(llr_i16[6 * i + 3]));
}
simde_mm_empty();
}
void nr_ulsch_256qam_llr(int32_t *rxdataF_comp,
int32_t *ul_ch_mag,
int32_t *ul_ch_magb,
int32_t *ul_ch_magc,
int16_t *ulsch_llr,
uint32_t nb_re,
uint8_t symbol)
{
simde__m256i *rxF_256 = (simde__m256i *)rxdataF_comp;
simde__m256i *llr256 = (simde__m256i *)ulsch_llr;
simde__m256i *ch_maga = (simde__m256i *)ul_ch_mag;
simde__m256i *ch_magb = (simde__m256i *)ul_ch_magb;
simde__m256i *ch_magc = (simde__m256i *)ul_ch_magc;
#ifndef USE_128BIT
simde__m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
for (int i = 0; i < (nb_re >> 3); i++) {
// registers of even index in xmm0-> |y_R|, registers of odd index in xmm0-> |y_I|
xmm0 = simde_mm256_abs_epi16(*rxF_256);
// registers of even index in xmm0-> |y_R|-|h|^2, registers of odd index in xmm0-> |y_I|-|h|^2
xmm0 = simde_mm256_subs_epi16(*ch_maga, xmm0);
// xmmtmpD2 contains 16 LLRs
xmm1 = simde_mm256_abs_epi16(xmm0);
xmm1 = simde_mm256_subs_epi16(*ch_magb, xmm1); // contains 16 LLRs
xmm2 = simde_mm256_abs_epi16(xmm1);
xmm2 = simde_mm256_subs_epi16(*ch_magc, xmm2); // contains 16 LLRs
// rxF[i] A0 A1 A2 A3 A4 A5 A6 A7 bits 7,6
// xmm0 B0 B1 B2 B3 B4 B5 B6 B7 bits 5,4
// xmm1 C0 C1 C2 C3 C4 C5 C6 C7 bits 3,2
// xmm2 D0 D1 D2 D3 D4 D5 D6 D7 bits 1,0
xmm3 = simde_mm256_unpacklo_epi32(*rxF_256, xmm0); // A0 B0 A1 B1 A4 B4 A5 B5
xmm4 = simde_mm256_unpackhi_epi32(*rxF_256, xmm0); // A2 B2 A3 B3 A6 B6 A7 B7
xmm5 = simde_mm256_unpacklo_epi32(xmm1, xmm2); // C0 D0 C1 D1 C4 D4 C5 D5
xmm6 = simde_mm256_unpackhi_epi32(xmm1, xmm2); // C2 D2 C3 D3 C6 D6 C7 D7
xmm0 = simde_mm256_unpacklo_epi64(xmm3, xmm5); // A0 B0 C0 D0 A4 B4 C4 D4
xmm1 = simde_mm256_unpackhi_epi64(xmm3, xmm5); // A1 B1 C1 D1 A5 B5 C5 D5
@@ -308,8 +308,77 @@ void nr_ulsch_256qam_llr(int32_t *rxdataF_comp,
*llr256++ = simde_mm256_permute2x128_si256(xmm2, xmm3, 0x20); // A2 B2 C2 D2 A3 B3 C3 D3
*llr256++ = simde_mm256_permute2x128_si256(xmm0, xmm1, 0x31); // A4 B4 C4 D4 A5 B5 C5 D5
*llr256++ = simde_mm256_permute2x128_si256(xmm2, xmm3, 0x31); // A6 B6 C6 D6 A7 B7 C7 D7
ch_magc++;
ch_magb++;
ch_maga++;
rxF_256++;
}
nb_re &= 0x7;
#endif
simde__m128i *rxF_128 = (simde__m128i *)rxF_256;
simde__m128i *llr_128 = (simde__m128i *)llr256;
simde__m128i *ch_maga_128 = (simde__m128i *)ch_maga;
simde__m128i *ch_magb_128 = (simde__m128i *)ch_magb;
simde__m128i *ch_magc_128 = (simde__m128i *)ch_magc;
for (int i = 0; i < (nb_re >> 2); i++) {
simde__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
// registers of even index in xmm0-> |y_R|, registers of odd index in xmm0-> |y_I|
xmm0 = simde_mm_abs_epi16(*rxF_128);
// registers of even index in xmm0-> |y_R|-|h|^2, registers of odd index in xmm0-> |y_I|-|h|^2
xmm0 = simde_mm_subs_epi16(*ch_maga_128, xmm0);
xmm1 = simde_mm_abs_epi16(xmm0);
xmm1 = simde_mm_subs_epi16(*ch_magb_128, xmm1); // contains 8 LLRs
xmm2 = simde_mm_abs_epi16(xmm1);
xmm2 = simde_mm_subs_epi16(*ch_magc_128, xmm2); // contains 8 LLRs
// rxF[i] A0 A1 A2 A3
// xmm0 B0 B1 B2 B3
// xmm1 C0 C1 C2 C3
// xmm2 D0 D1 D2 D3
xmm3 = simde_mm_unpacklo_epi32(*rxF_128, xmm0); // A0 B0 A1 B1
xmm4 = simde_mm_unpackhi_epi32(*rxF_128, xmm0); // A2 B2 A3 B3
xmm5 = simde_mm_unpacklo_epi32(xmm1, xmm2); // C0 D0 C1 D1
xmm6 = simde_mm_unpackhi_epi32(xmm1, xmm2); // C2 D2 C3 D3
*llr_128++ = simde_mm_unpacklo_epi64(xmm3, xmm5); // A0 B0 C0 D0
*llr_128++ = simde_mm_unpackhi_epi64(xmm3, xmm5); // A1 B1 C1 D1
*llr_128++ = simde_mm_unpacklo_epi64(xmm4, xmm6); // A2 B2 C2 D2
*llr_128++ = simde_mm_unpackhi_epi64(xmm4, xmm6); // A3 B3 C3 D3
rxF_128++;
ch_maga_128++;
ch_magb_128++;
ch_magc_128++;
}
if (nb_re & 3) {
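// Scalar tail: each remaining RE yields 8 16-bit LLRs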
for (int i = 0; i < (nb_re & 0x3); i++) {
int16_t *rxDataF_i16 = (int16_t *)rxF_128;
int16_t *ul_ch_mag_i16 = (int16_t *)ch_maga_128;
int16_t *ul_ch_magb_i16 = (int16_t *)ch_magb_128;
int16_t *ul_ch_magc_i16 = (int16_t *)ch_magc_128;
int16_t *ulsch_llr_i16 = (int16_t *)llr_128;
int16_t real = rxDataF_i16[2 * i + 0];
int16_t imag = rxDataF_i16[2 * i + 1];
int16_t mag_real = ul_ch_mag_i16[2 * i];
int16_t mag_imag = ul_ch_mag_i16[2 * i + 1];
ulsch_llr_i16[8 * i] = real;
ulsch_llr_i16[8 * i + 1] = imag;
ulsch_llr_i16[8 * i + 2] = saturating_sub(mag_real, abs(real));
ulsch_llr_i16[8 * i + 3] = saturating_sub(mag_imag, abs(imag));
int16_t magb_real = ul_ch_magb_i16[2 * i];
int16_t magb_imag = ul_ch_magb_i16[2 * i + 1];
ulsch_llr_i16[8 * i + 4] = saturating_sub(magb_real, abs(ulsch_llr_i16[8 * i + 2]));
ulsch_llr_i16[8 * i + 5] = saturating_sub(magb_imag, abs(ulsch_llr_i16[8 * i + 3]));
int16_t magc_real = ul_ch_magc_i16[2 * i];
int16_t magc_imag = ul_ch_magc_i16[2 * i + 1];
ulsch_llr_i16[8 * i + 6] = saturating_sub(magc_real, abs(ulsch_llr_i16[8 * i + 4]));
ulsch_llr_i16[8 * i + 7] = saturating_sub(magc_imag, abs(ulsch_llr_i16[8 * i + 5]));
}
}
simde_mm_empty();
}
add_executable(test_llr test_llr.cpp)
target_link_libraries(test_llr PRIVATE PHY_NR GTest::gtest minimal_lib)
add_dependencies(tests test_llr)
add_test(NAME test_llr
COMMAND ./test_llr)
/*
* Licensed to the OpenAirInterface (OAI) Software Alliance under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The OpenAirInterface Software Alliance licenses this file to You under
* the OAI Public License, Version 1.1 (the "License"); you may not use this file
* except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.openairinterface.org/?page_id=698
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*-------------------------------------------------------------------------------
* For more information about the OpenAirInterface (OAI) Software Alliance:
* contact@openairinterface.org
*/
#include "gtest/gtest.h"
#include <stdint.h>
#include <vector>
#include <algorithm>
#include <numeric>
extern "C" {
void nr_ulsch_16qam_llr(int32_t *rxdataF_comp, int32_t *ul_ch_mag, int16_t *ulsch_llr, uint32_t nb_re, uint8_t symbol);
void nr_ulsch_64qam_llr(int32_t *rxdataF_comp,
int32_t *ul_ch_mag,
int32_t *ul_ch_magb,
int16_t *ulsch_llr,
uint32_t nb_re,
uint8_t symbol);
void nr_ulsch_256qam_llr(int32_t *rxdataF_comp,
int32_t *ul_ch_mag,
int32_t *ul_ch_magb,
int32_t *ul_ch_magc,
int16_t *ulsch_llr,
uint32_t nb_re,
uint8_t symbol);
struct configmodule_interface_s;
struct configmodule_interface_s *uniqCfg = NULL;
void exit_function(const char *file, const char *function, const int line, const char *s, const int assert)
{
if (assert) {
abort();
} else {
exit(EXIT_SUCCESS);
}
}
#include "openair1/PHY/TOOLS/tools_defs.h"
}
#include <cstdio>
#include "common/utils/LOG/log.h"
#include <cstdlib>
#include <memory>
#include <random>
constexpr bool is_power_of_two(uint64_t n)
{
return n > 0 && (n & (n - 1)) == 0;
}
size_t align_up(size_t a, size_t b)
{
return (a + b - 1) / b * b;
}
int16_t saturating_sub(int16_t a, int16_t b)
{
int32_t result = (int32_t)a - (int32_t)b;
if (result < INT16_MIN) {
return INT16_MIN;
} else if (result > INT16_MAX) {
return INT16_MAX;
} else {
return (int16_t)result;
}
}
// Allocator adaptation for std::vector. This is needed because the AVX functions expect 256-bit aligned buffers.
template <typename T, size_t alignment>
class AlignedAllocator {
public:
static_assert(is_power_of_two(alignment), "Alignment should be power of 2");
static_assert(alignment >= 8, "Alignment must be at least 8 bits");
using value_type = T;
AlignedAllocator() = default;
AlignedAllocator(const AlignedAllocator &) = default;
AlignedAllocator &operator=(const AlignedAllocator &) = default;
template <typename U>
struct rebind {
using other = AlignedAllocator<U, alignment>;
};
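// The template parameter gives the alignment in bits; aligned_alloc() takes bytes and requires the
// allocation size to be a multiple of the alignment.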
T *allocate(size_t n)
{
size_t alignment_bytes = alignment / 8;
void *ptr = ::aligned_alloc(alignment_bytes, align_up(n * sizeof(T), alignment_bytes));
return static_cast<T *>(ptr);
}
void deallocate(T *p, size_t n)
{
::free(p);
}
};
// Use a 512-bit-aligned vector in case some functions use AVX-512
template <typename T>
using AlignedAllocator512 = AlignedAllocator<T, 512>;
template <typename T>
using AlignedVector512 = std::vector<T, AlignedAllocator512<T>>;
void nr_ulsch_16qam_llr_ref(c16_t *rxdataF_comp, int32_t *ul_ch_mag, int16_t *ulsch_llr, uint32_t nb_re, uint8_t symbol)
{
int16_t *ul_ch_mag_i16 = (int16_t *)ul_ch_mag;
for (auto i = 0U; i < nb_re; i++) {
int16_t real = rxdataF_comp[i].r;
int16_t imag = rxdataF_comp[i].i;
int16_t mag_real = ul_ch_mag_i16[2 * i];
int16_t mag_imag = ul_ch_mag_i16[2 * i + 1];
ulsch_llr[4 * i] = real;
ulsch_llr[4 * i + 1] = imag;
ulsch_llr[4 * i + 2] = saturating_sub(mag_real, std::abs(real));
ulsch_llr[4 * i + 3] = saturating_sub(mag_imag, std::abs(imag));
}
}
void nr_ulsch_64qam_llr_ref(c16_t *rxdataF_comp,
int32_t *ul_ch_mag,
int32_t *ul_ch_magb,
int16_t *ulsch_llr,
uint32_t nb_re,
uint8_t symbol)
{
int16_t *ul_ch_mag_i16 = (int16_t *)ul_ch_mag;
int16_t *ul_ch_magb_i16 = (int16_t *)ul_ch_magb;
for (auto i = 0U; i < nb_re; i++) {
int16_t real = rxdataF_comp[i].r;
int16_t imag = rxdataF_comp[i].i;
int16_t mag_real = ul_ch_mag_i16[2 * i];
int16_t mag_imag = ul_ch_mag_i16[2 * i + 1];
ulsch_llr[6 * i] = real;
ulsch_llr[6 * i + 1] = imag;
ulsch_llr[6 * i + 2] = saturating_sub(mag_real, std::abs(real));
ulsch_llr[6 * i + 3] = saturating_sub(mag_imag, std::abs(imag));
int16_t mag_realb = ul_ch_magb_i16[2 * i];
int16_t mag_imagb = ul_ch_magb_i16[2 * i + 1];
ulsch_llr[6 * i + 4] = saturating_sub(mag_realb, std::abs(ulsch_llr[6 * i + 2]));
ulsch_llr[6 * i + 5] = saturating_sub(mag_imagb, std::abs(ulsch_llr[6 * i + 3]));
}
}
void nr_ulsch_256qam_llr_ref(c16_t *rxdataF_comp,
int32_t *ul_ch_mag,
int32_t *ul_ch_magb,
int32_t *ul_ch_magc,
int16_t *ulsch_llr,
uint32_t nb_re,
uint8_t symbol)
{
int16_t *ul_ch_mag_i16 = (int16_t *)ul_ch_mag;
int16_t *ul_ch_magb_i16 = (int16_t *)ul_ch_magb;
int16_t *ul_ch_magc_i16 = (int16_t *)ul_ch_magc;
for (auto i = 0U; i < nb_re; i++) {
int16_t real = rxdataF_comp[i].r;
int16_t imag = rxdataF_comp[i].i;
int16_t mag_real = ul_ch_mag_i16[2 * i];
int16_t mag_imag = ul_ch_mag_i16[2 * i + 1];
ulsch_llr[8 * i] = real;
ulsch_llr[8 * i + 1] = imag;
ulsch_llr[8 * i + 2] = saturating_sub(mag_real, std::abs(real));
ulsch_llr[8 * i + 3] = saturating_sub(mag_imag, std::abs(imag));
int16_t magb_real = ul_ch_magb_i16[2 * i];
int16_t magb_imag = ul_ch_magb_i16[2 * i + 1];
ulsch_llr[8 * i + 4] = saturating_sub(magb_real, std::abs(ulsch_llr[8 * i + 2]));
ulsch_llr[8 * i + 5] = saturating_sub(magb_imag, std::abs(ulsch_llr[8 * i + 3]));
int16_t magc_real = ul_ch_magc_i16[2 * i];
int16_t magc_imag = ul_ch_magc_i16[2 * i + 1];
ulsch_llr[8 * i + 6] = saturating_sub(magc_real, std::abs(ulsch_llr[8 * i + 4]));
ulsch_llr[8 * i + 7] = saturating_sub(magc_imag, std::abs(ulsch_llr[8 * i + 5]));
}
}
AlignedVector512<c16_t> generate_random_c16(size_t num)
{
std::random_device rd;
std::mt19937 rng(rd());
std::uniform_int_distribution<int16_t> dist(INT16_MIN, INT16_MAX);
AlignedVector512<c16_t> vec;
vec.resize(num);
auto gen = [&]() { return (c16_t){dist(rng), dist(rng)}; };
std::generate(vec.begin(), vec.end(), gen);
return vec;
}
AlignedVector512<uint16_t> generate_random_uint16(size_t num)
{
AlignedVector512<uint16_t> vec;
vec.resize(num);
auto gen = [&]() { return static_cast<uint16_t>(std::rand()); };
std::generate(vec.begin(), vec.end(), gen);
return vec;
}
void test_function_16_qam(AlignedVector512<uint32_t> nb_res)
{
for (auto i = 0U; i < nb_res.size(); i++) {
uint32_t nb_re = nb_res[i];
auto rf_data = generate_random_c16(nb_re);
auto magnitude_data = generate_random_uint16(nb_re * 2);
AlignedVector512<uint64_t> ulsch_llr_ref;
ulsch_llr_ref.resize(nb_re);
std::fill(ulsch_llr_ref.begin(), ulsch_llr_ref.end(), 0);
nr_ulsch_16qam_llr_ref((c16_t *)rf_data.data(), (int32_t *)magnitude_data.data(), (int16_t *)ulsch_llr_ref.data(), nb_re, 0);
AlignedVector512<uint64_t> ulsch_llr;
ulsch_llr.resize(nb_re);
std::fill(ulsch_llr.begin(), ulsch_llr.end(), 0);
nr_ulsch_16qam_llr((int32_t *)rf_data.data(), (int32_t *)magnitude_data.data(), (int16_t *)ulsch_llr.data(), nb_re, 0);
int num_errors = 0;
for (auto i = 0U; i < nb_re; i++) {
EXPECT_EQ(ulsch_llr_ref[i], ulsch_llr[i])
<< "Mismatch 16qam REF " << std::hex << ulsch_llr_ref[i] << " != DUT " << ulsch_llr[i] << " at " << std::dec << i;
if (ulsch_llr_ref[i] != ulsch_llr[i]) {
num_errors++;
}
}
EXPECT_EQ(num_errors, 0) << " Errors during testing 16qam llr " << num_errors << " nb res " << nb_re;
}
}
void test_function_64_qam(AlignedVector512<uint32_t> nb_res)
{
for (auto i = 0U; i < nb_res.size(); i++) {
uint32_t nb_re = nb_res[i];
auto rf_data = generate_random_c16(nb_re);
auto magnitude_data = generate_random_uint16(nb_re * 2);
auto magnitude_b_data = generate_random_uint16(nb_re * 2);
AlignedVector512<uint32_t> ulsch_llr_ref;
ulsch_llr_ref.resize(nb_re * 3);
std::fill(ulsch_llr_ref.begin(), ulsch_llr_ref.end(), 0);
nr_ulsch_64qam_llr_ref((c16_t *)rf_data.data(),
(int32_t *)magnitude_data.data(),
(int32_t *)magnitude_b_data.data(),
(int16_t *)ulsch_llr_ref.data(),
nb_re,
0);
AlignedVector512<uint32_t> ulsch_llr;
ulsch_llr.resize(nb_re * 3);
std::fill(ulsch_llr.begin(), ulsch_llr.end(), 0);
nr_ulsch_64qam_llr((int32_t *)rf_data.data(),
(int32_t *)magnitude_data.data(),
(int32_t *)magnitude_b_data.data(),
(int16_t *)ulsch_llr.data(),
nb_re,
0);
int num_errors = 0;
for (auto i = 0U; i < nb_re * 3; i++) {
EXPECT_EQ(ulsch_llr_ref[i], ulsch_llr[i])
<< "Mismatch 64qam REF " << std::hex << ulsch_llr_ref[i] << " != DUT " << ulsch_llr[i] << " at " << std::dec << i;
if (ulsch_llr_ref[i] != ulsch_llr[i]) {
num_errors++;
}
}
EXPECT_EQ(num_errors, 0) << " Errors during testing 64qam llr " << num_errors << " nb res " << nb_re;
}
}
void test_function_256_qam(AlignedVector512<uint32_t> nb_res)
{
for (auto i = 0U; i < nb_res.size(); i++) {
uint32_t nb_re = nb_res[i];
auto rf_data = generate_random_c16(nb_re);
auto magnitude_data = generate_random_uint16(nb_re * 2);
auto magnitude_b_data = generate_random_uint16(nb_re * 2);
auto magnitude_c_data = generate_random_uint16(nb_re * 2);
AlignedVector512<uint32_t> ulsch_llr_ref;
ulsch_llr_ref.resize(nb_re * 4);
std::fill(ulsch_llr_ref.begin(), ulsch_llr_ref.end(), 0);
nr_ulsch_256qam_llr_ref((c16_t *)rf_data.data(),
(int32_t *)magnitude_data.data(),
(int32_t *)magnitude_b_data.data(),
(int32_t *)magnitude_c_data.data(),
(int16_t *)ulsch_llr_ref.data(),
nb_re,
0);
AlignedVector512<uint32_t> ulsch_llr;
ulsch_llr.resize(nb_re * 4);
std::fill(ulsch_llr.begin(), ulsch_llr.end(), 0);
nr_ulsch_256qam_llr((int32_t *)rf_data.data(),
(int32_t *)magnitude_data.data(),
(int32_t *)magnitude_b_data.data(),
(int32_t *)magnitude_c_data.data(),
(int16_t *)ulsch_llr.data(),
nb_re,
0);
int num_errors = 0;
for (auto i = 0U; i < nb_re * 4; i++) {
EXPECT_EQ(ulsch_llr_ref[i], ulsch_llr[i])
<< "Mismatch 256qam REF " << std::hex << ulsch_llr_ref[i] << " != DUT " << ulsch_llr[i] << " at " << std::dec << i;
if (ulsch_llr_ref[i] != ulsch_llr[i]) {
num_errors++;
}
}
EXPECT_EQ(num_errors, 0) << " Errors during testing 256qam llr " << num_errors << " nb res " << nb_re;
}
}
TEST(test_llr, verify_reference_implementation_16qam)
{
test_function_16_qam({16, 32, 24, 40, 48, 8 * 300});
}
TEST(test_llr, test_8_res_16qam)
{
test_function_16_qam({8});
}
TEST(test_llr, test_4_res_16qam)
{
test_function_16_qam({4});
}
TEST(test_llr, test_5_res_16qam)
{
test_function_16_qam({5});
}
// Previously this caused a segmentation fault because the function assumed extra buffer space and read non-existent REs
TEST(test_llr, no_segmentation_fault_at_12_res_16qam)
{
test_function_16_qam({12});
}
// Previously this caused a segmentation fault because the function assumed extra buffer space and read non-existent REs
TEST(test_llr, no_segmentation_fault_at_36_res_16qam)
{
test_function_16_qam({36});
}
// any number of REs should work
TEST(test_llr, no_segfault_any_number_of_re_16qam)
{
for (uint32_t i = 0U; i < 1000U; i++) {
test_function_16_qam({i});
}
}
TEST(test_llr, verify_reference_implementation_64qam)
{
test_function_64_qam({16, 24, 32, 80, 8 * 300});
}
TEST(test_llr, test_8_res_64qam)
{
test_function_64_qam({8});
}
TEST(test_llr, test_4_res_64qam)
{
test_function_64_qam({4});
}
// Previously this caused a segmentation fault because the function assumed extra buffer space and read non-existent REs
TEST(test_llr, no_segmentation_fault_at_12_res_64qam)
{
test_function_64_qam({12});
}
// Previously this caused a segmentation fault because the function assumed extra buffer space and read non-existent REs
TEST(test_llr, no_segmentation_fault_at_36_res_64qam)
{
test_function_64_qam({36});
}
// any number of REs should work
TEST(test_llr, no_segfault_any_number_of_re_64qam)
{
for (uint32_t i = 0U; i < 1000U; i++) {
test_function_64_qam({i});
}
}
TEST(test_llr, verify_reference_implementation_256qam)
{
test_function_256_qam({16, 24, 32, 80, 8 * 300});
}
TEST(test_llr, test_8_res_256qam)
{
test_function_256_qam({8});
}
TEST(test_llr, test_4_res_256qam)
{
test_function_256_qam({4});
}
// Previously this caused a segmentation fault because the function assumed extra buffer space and read non-existent REs
TEST(test_llr, no_segmentation_fault_at_12_res_256qam)
{
test_function_256_qam({12});
}
// Previously this caused a segmentation fault because the function assumed extra buffer space and read non-existent REs
TEST(test_llr, no_segmentation_fault_at_36_res_256qam)
{
test_function_256_qam({36});
}
// any number of REs should work
TEST(test_llr, no_segfault_any_number_of_re_256qam)
{
for (uint32_t i = 0U; i < 1000U; i++) {
test_function_256_qam({i});
}
}
// It is possible to implement an AVX-accelerated LLR computation for multiples of 2 REs.
// This test case can be used to verify such an implementation, as it visualizes the LLR data with printfs
TEST(test_llr, check_2_res_256_qam)
{
AlignedVector512<c16_t> rf_data = {{1, 1}, {2, 2}};
AlignedVector512<int16_t> magnitude_data = {1, 1, 1, 1};
AlignedVector512<int16_t> magnitude_b_data = {2, 2, 2, 2};
AlignedVector512<int16_t> magnitude_c_data = {3, 3, 3, 3};
AlignedVector512<int16_t> ulsch_llr_ref;
ulsch_llr_ref.resize(2 * 8);
std::fill(ulsch_llr_ref.begin(), ulsch_llr_ref.end(), 0);
nr_ulsch_256qam_llr_ref((c16_t *)rf_data.data(),
(int32_t *)magnitude_data.data(),
(int32_t *)magnitude_b_data.data(),
(int32_t *)magnitude_c_data.data(),
(int16_t *)ulsch_llr_ref.data(),
2,
0);
AlignedVector512<int16_t> ulsch_llr;
ulsch_llr.resize(2 * 8);
std::fill(ulsch_llr.begin(), ulsch_llr.end(), 0);
nr_ulsch_256qam_llr((int32_t *)rf_data.data(),
(int32_t *)magnitude_data.data(),
(int32_t *)magnitude_b_data.data(),
(int32_t *)magnitude_c_data.data(),
(int16_t *)ulsch_llr.data(),
2,
0);
printf("\nDUT:\n");
for (auto i = 0U; i < 2; i++) {
printf("%d %d %d %d %d %d %d %d\n",
ulsch_llr[i * 8],
ulsch_llr[i * 8 + 1],
ulsch_llr[i * 8 + 2],
ulsch_llr[i * 8 + 3],
ulsch_llr[i * 8 + 4],
ulsch_llr[i * 8 + 5],
ulsch_llr[i * 8 + 6],
ulsch_llr[i * 8 + 7]);
}
printf("\nREF:\n");
for (auto i = 0U; i < 2; i++) {
printf("%d %d %d %d %d %d %d %d\n",
ulsch_llr_ref[i * 8],
ulsch_llr_ref[i * 8 + 1],
ulsch_llr_ref[i * 8 + 2],
ulsch_llr_ref[i * 8 + 3],
ulsch_llr_ref[i * 8 + 4],
ulsch_llr_ref[i * 8 + 5],
ulsch_llr_ref[i * 8 + 6],
ulsch_llr_ref[i * 8 + 7]);
}
int num_errors = 0;
for (auto i = 0U; i < 2 * 8; i++) {
EXPECT_EQ(ulsch_llr_ref[i], ulsch_llr[i])
<< "Mismatch 256qam REF " << std::hex << ulsch_llr_ref[i] << " != DUT " << ulsch_llr[i] << " at " << std::dec << i;
if (ulsch_llr_ref[i] != ulsch_llr[i]) {
num_errors++;
}
}
EXPECT_EQ(num_errors, 0) << " Errors during testing 256qam llr " << num_errors << " nb res " << 2;
}
int main(int argc, char **argv)
{
logInit();
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}