Commit 92cef613 authored by Tsung-Yu Chan

fix / 2-layer inner_rx

  align the buffers so their length is a multiple of 256 bits
parent c35025f5
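Note on the change: in the diff below, the three 2-layer inner receivers (QPSK/16QAM/64QAM) previously padded their per-symbol working buffers with a fixed `+ 4` int32_t words, i.e. one extra 128-bit vector. The 256-bit AVX2 path consumes 8 int32_t per vector, so whenever `length % 8` is 1, 2, or 3 the fixed pad left the variable-length arrays short of a whole number of 256-bit vectors and the last load/store ran past the end. The commit replaces the fixed pad with `add_shift`, which rounds `length` up to the next multiple of 8. A minimal standalone sketch of the rule (`padded_length` is a hypothetical helper, not OAI code):

```c
#include <assert.h>
#include <stdint.h>

/* Minimal sketch of the padding rule introduced by this commit:
 * round length (in int32_t words) up to the next multiple of 8,
 * i.e. to a whole number of 256-bit AVX2 vectors, so the last
 * SIMD load/store never runs past the end of the buffer. */
static int padded_length(int length)
{
  int add_shift = 0;
  if (length % 8)
    add_shift = 8 - length % 8;
  return length + add_shift;
}

int main(void)
{
  /* The old fixed pad of 4 words fails when length % 8 is 1..3. */
  assert(padded_length(16) == 16); /* already a vector multiple          */
  assert(padded_length(18) == 24); /* old 18 + 4 = 22 was still short    */
  return 0;
}
```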
@@ -1347,8 +1347,12 @@ void inner_rx_qpsk_2layer (NR_DL_FRAME_PARMS *frame_parms,
                            int dmrs_symbol_flag,
                            int output_shift)
 {
-  int32_t rxFext[nb_rx_ant][length + 4] __attribute__((aligned(32)));
-  int32_t chFext[nb_layer*nb_rx_ant][length + 4] __attribute__((aligned(32)));
+  int add_shift = 0;
+  if (length % 8)
+    add_shift = 8 - length % 8;
+  int32_t rxFext[nb_rx_ant][length + add_shift] __attribute__((aligned(32)));
+  int32_t chFext[nb_layer*nb_rx_ant][length + add_shift] __attribute__((aligned(32)));
   for (int aarx = 0; aarx < nb_rx_ant; aarx++)
   {
     for (int aatx = 0; aatx < nb_layer; aatx++)
@@ -1365,8 +1369,8 @@ void inner_rx_qpsk_2layer (NR_DL_FRAME_PARMS *frame_parms,
                       frame_parms);
     }
   }
-  int32_t rho[nb_layer*nb_layer][length + 4] __attribute__((aligned(32)));
-  int32_t rxFext_comp[nb_layer][length + 4] __attribute__((aligned(32)));
+  int32_t rho[nb_layer*nb_layer][length + add_shift] __attribute__((aligned(32)));
+  int32_t rxFext_comp[nb_layer][length + add_shift] __attribute__((aligned(32)));
   for (int aarx = 0; aarx < nb_rx_ant; aarx++)
   {
     for (int aatx = 0; aatx < nb_layer; aatx++)
@@ -1374,7 +1378,7 @@ void inner_rx_qpsk_2layer (NR_DL_FRAME_PARMS *frame_parms,
       for (int atx = 0; atx < nb_layer; atx++)
       {
 #ifdef USE_128BIT
-        simde__m128i mmtmpD0, mmtmpD1, mmtmpD2, mmtmpD3;
+        simde__m128i mmtmpD0, mmtmpD1, mmtmpD2, mmtmpD3, mmtmpD4;
         simde__m128i *rho128 = (simde__m128i *)rho[aatx*nb_layer+atx];
         simde__m128i *ul_ch128 = (simde__m128i *)chFext[aatx * nb_rx_ant + aarx];
         simde__m128i *ul_ch128_2 = (simde__m128i *)chFext[atx * nb_rx_ant + aarx];
@@ -1392,10 +1396,11 @@ void inner_rx_qpsk_2layer (NR_DL_FRAME_PARMS *frame_parms,
           mmtmpD1 = simde_mm_srai_epi32(mmtmpD1, output_shift);
           mmtmpD2 = simde_mm_unpacklo_epi32(mmtmpD0, mmtmpD1);
           mmtmpD3 = simde_mm_unpackhi_epi32(mmtmpD0, mmtmpD1);
+          mmtmpD4 = simde_mm_packs_epi32(mmtmpD2, mmtmpD3);
           if (aarx == 0)
-            rho128[i] = simde_mm_packs_epi32(mmtmpD2, mmtmpD3);
+            rho128[i] = mmtmpD4;
           else
-            rho128[i] = simde_mm_adds_epi16(rho128[i], simde_mm_packs_epi32(mmtmpD2, mmtmpD3));
+            rho128[i] = simde_mm_adds_epi16(rho128[i], mmtmpD4);
         }
 #else
         simde__m256i mmtmpD0, mmtmpD1, mmtmpD2, mmtmpD3, mmtmpD4;
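The second change here is mechanical: the `simde_mm_packs_epi32(mmtmpD2, mmtmpD3)` result is hoisted into `mmtmpD4` so the saturating pack is written once and reused by both branches of the `aarx` accumulation. For reference, a scalar sketch of the per-element term this loop builds up in `rho` (illustrative only; it assumes the usual conjugate-multiply convention, `conj(chFext[aatx]) * chFext[atx]` summed over receive antennas, and truncates where the pack/add intrinsics saturate):

```c
#include <stdint.h>

typedef struct { int16_t r; int16_t i; } c16_t; /* standalone stand-in for OAI's c16_t */

/* Scalar model of one rho term: conj(a) * b, scaled down by
 * output_shift. The SIMD loop above computes the same product with
 * madd/shuffle intrinsics and saturates when packing to int16;
 * this sketch simply truncates. */
static c16_t rho_term(c16_t a, c16_t b, int output_shift)
{
  int32_t re = ((int32_t)a.r * b.r + (int32_t)a.i * b.i) >> output_shift;
  int32_t im = ((int32_t)a.r * b.i - (int32_t)a.i * b.r) >> output_shift;
  return (c16_t){ (int16_t)re, (int16_t)im };
}
```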
@@ -1513,8 +1518,12 @@ void inner_rx_16qam_2layer (NR_DL_FRAME_PARMS *frame_parms,
                             int dmrs_symbol_flag,
                             int output_shift)
 {
-  int32_t rxFext[nb_rx_ant][length + 4] __attribute__((aligned(32)));
-  int32_t chFext[nb_layer*nb_rx_ant][length + 4] __attribute__((aligned(32)));
+  int add_shift = 0;
+  if (length % 8)
+    add_shift = 8 - length % 8;
+  int32_t rxFext[nb_rx_ant][length + add_shift] __attribute__((aligned(32)));
+  int32_t chFext[nb_layer*nb_rx_ant][length + add_shift] __attribute__((aligned(32)));
   for (int aarx = 0; aarx < nb_rx_ant; aarx++)
   {
     for (int aatx = 0; aatx < nb_layer; aatx++)
@@ -1532,9 +1541,9 @@ void inner_rx_16qam_2layer (NR_DL_FRAME_PARMS *frame_parms,
     }
   }
-  int32_t rho[nb_layer*nb_layer][length + 4] __attribute__((aligned(32)));
-  int32_t rxFext_comp[nb_layer][length + 4] __attribute__((aligned(32)));
-  int32_t ul_ch_mag[nb_layer][length + 4] __attribute__((aligned(32)));
+  int32_t rho[nb_layer*nb_layer][length + add_shift] __attribute__((aligned(32)));
+  int32_t rxFext_comp[nb_layer][length + add_shift] __attribute__((aligned(32)));
+  int32_t ul_ch_mag[nb_layer][length + add_shift] __attribute__((aligned(32)));
   for (int aatx = 0; aatx < nb_layer; aatx++)
   {
     for (int aarx = 0; aarx < nb_rx_ant; aarx++)
@@ -1705,8 +1714,12 @@ void inner_rx_64qam_2layer (NR_DL_FRAME_PARMS *frame_parms,
                             int dmrs_symbol_flag,
                             int output_shift)
 {
-  int32_t rxFext[nb_rx_ant][length + 4] __attribute__((aligned(32)));
-  int32_t chFext[nb_layer*nb_rx_ant][length + 4] __attribute__((aligned(32)));
+  int add_shift = 0;
+  if (length % 8)
+    add_shift = 8 - length % 8;
+  int32_t rxFext[nb_rx_ant][length + add_shift] __attribute__((aligned(32)));
+  int32_t chFext[nb_layer*nb_rx_ant][length + add_shift] __attribute__((aligned(32)));
   for (int aarx = 0; aarx < nb_rx_ant; aarx++)
   {
     for (int aatx = 0; aatx < nb_layer; aatx++)
@@ -1723,9 +1736,9 @@ void inner_rx_64qam_2layer (NR_DL_FRAME_PARMS *frame_parms,
                       frame_parms);
     }
   }
-  int32_t rho[nb_layer*nb_layer][length + 4] __attribute__((aligned(32)));
-  int32_t rxFext_comp[nb_layer][length + 4] __attribute__((aligned(32)));
-  int32_t ul_ch_mag[nb_layer][length + 4] __attribute__((aligned(32)));
+  int32_t rho[nb_layer*nb_layer][length + add_shift] __attribute__((aligned(32)));
+  int32_t rxFext_comp[nb_layer][length + add_shift] __attribute__((aligned(32)));
+  int32_t ul_ch_mag[nb_layer][length + add_shift] __attribute__((aligned(32)));
   for (int aatx = 0; aatx < nb_layer; aatx++)
   {
     for (int aarx = 0; aarx < nb_rx_ant; aarx++)
......
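The 16QAM and 64QAM variants receive the same padding change, extended to their additional `ul_ch_mag` buffer. Two conditions together make the full-vector loops safe: `__attribute__((aligned(32)))` fixes the base address, and `add_shift` makes the element count a whole number of 256-bit vectors. A sketch of the resulting tail-free processing pattern (illustrative only, not OAI code):

```c
#include <simde/x86/avx2.h>
#include <stdint.h>

/* Illustrative only: with the base address 32-byte aligned and the
 * element count padded to a multiple of 8 int32_t, the buffer divides
 * into whole 256-bit vectors and the loop needs no scalar tail. */
static void srai_buffer(int32_t *buf, int padded_len, int shift)
{
  simde__m256i *v = (simde__m256i *)buf;
  for (int i = 0; i < padded_len >> 3; i++) /* 8 x int32 per vector */
    v[i] = simde_mm256_srai_epi32(v[i], shift);
}
```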
@@ -3746,6 +3746,20 @@ void nr_ulsch_shift_llr(int16_t **llr_layers, uint32_t nb_re, uint32_t rxdataF_e
   simde__m128i *llr_layers0 = (simde__m128i *)&llr_layers[0][rxdataF_ext_offset];
   simde__m128i *llr_layers1 = (simde__m128i *)&llr_layers[1][rxdataF_ext_offset];
+  uint8_t mem_offset = ((16 - ((long)llr_layers0)) & 0xF) >> 2;
+  if (mem_offset > 0) {
+    c16_t *llr_layers0_c16 = (c16_t *)&llr_layers[0][rxdataF_ext_offset];
+    c16_t *llr_layers1_c16 = (c16_t *)&llr_layers[1][rxdataF_ext_offset];
+    for (int i = 0; i < mem_offset; i++)
+    {
+      llr_layers0_c16[i] = c16Shift(llr_layers0_c16[i], shift);
+      llr_layers1_c16[i] = c16Shift(llr_layers1_c16[i], shift);
+    }
+    llr_layers0 = (simde__m128i *)&llr_layers[0][rxdataF_ext_offset + (mem_offset << 1)];
+    llr_layers1 = (simde__m128i *)&llr_layers[1][rxdataF_ext_offset + (mem_offset << 1)];
+  }
   for (int i = 0; i < nb_re >> 2; i++) {
     llr_layers0[i] = simde_mm_srai_epi16(llr_layers0[i], shift);
     llr_layers1[i] = simde_mm_srai_epi16(llr_layers1[i], shift);
......
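The new head loop peels off the complex LLRs that sit before the first 16-byte boundary, shifting them one `c16_t` at a time with `c16Shift`, then restarts the 128-bit loop on an aligned pointer; the `mem_offset << 1` advance converts complex elements back into `int16_t` units. A small standalone sketch of the offset arithmetic (`head_elems` is a hypothetical helper, not OAI code):

```c
#include <assert.h>
#include <stdint.h>

/* Sketch of the head-peel arithmetic: count how many 4-byte complex
 * LLRs (int16 re/im pairs) precede the next 16-byte boundary, so they
 * can be handled one at a time before the 128-bit loop takes over. */
static unsigned head_elems(const void *p)
{
  return ((16u - (uintptr_t)p) & 0xFu) >> 2;
}

int main(void)
{
  int16_t buf[16] __attribute__((aligned(16)));
  assert(head_elems(&buf[0]) == 0); /* already aligned: nothing to peel */
  assert(head_elems(&buf[2]) == 3); /* 4 bytes in: 12 bytes / 4 = 3     */
  return 0;
}
```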