Commit 651ef3da authored by Robert Schmidt's avatar Robert Schmidt

Merge remote-tracking branch 'origin/resource_mapping_optim' into integration_2025_w07 (!3127)

Optimizations of PDSCH Resource Mapping in nr_dlsch.c/nr_modulation.c

These changes add SIMD optimizations for Neon/AVX2/AVX512 in the PDSCH
transmit path. The timing improvements are listed here based on the

    nr_dlsim -e25 -R273 -b273 -s30 -x "layers" -y 4 -z 4 -P

benchmark with "layers" 2,3,4 and comparing "PHY proc tx":

273 PRBS, mcs25, 64QAM

peafowl (gcc11,AMD EPYC 9374F)

    2-layer, 4 TX : 431 us (develop 565 us)
    3-layer, 4 TX : 692 us (develop 849 us)
    4-layer, 4 TX : 963 us (develop 1172 us)

stupix (gcc10, Xeon Gold 6354)

    2-layer, 4 TX : 568 us (develop 652 us)
    3-layer, 4 TX : 901 us (develop 1030 us)
    4-layer, 4 TX : 1250 us (develop 1396 us)

matix (gcc14, Ryzen 9 PRO 7945)

    2-layer, 4 TX : 317 us (develop 505 us)
    3-layer, 4 TX : 538 us (develop 779 us)
    4-layer, 4 TX : 767 us (develop 1233 us)
parents f0d6d42a 8a63b013
......@@ -22,10 +22,15 @@
#include "nr_modulation.h"
#include "PHY/NR_REFSIG/nr_mod_table.h"
#include "executables/softmodem-common.h"
#include <simde/x86/avx512.h>
// Lacking declaration in present simde external package, will be detected as compilation error when they will add it
#define simde_mm512_extracti64x2_epi64(a...) _mm512_extracti64x2_epi64(a)
// #define DEBUG_DLSCH_PRECODING_PRINT_WITH_TRIVIAL // TODO: For debug, to be removed if want to merge to develop
//Table 6.3.1.5-1 Precoding Matrix W 1 layer 2 antenna ports 'n' = -1 and 'o' = -j
// #define DEBUG_LAYER_MAPPING
#define USE_NEON
// #define USE_GATHER
// Table 6.3.1.5-1 Precoding Matrix W 1 layer 2 antenna ports 'n' = -1 and 'o' = -j
const char nr_W_1l_2p[6][2][1] = {
{{'1'}, {'0'}}, // pmi 0
{{'0'}, {'1'}},
......@@ -35,7 +40,7 @@ const char nr_W_1l_2p[6][2][1] = {
{{'1'}, {'o'}} // pmi 5
};
//Table 6.3.1.5-3 Precoding Matrix W 1 layer 4 antenna ports 'n' = -1 and 'o' = -j
// Table 6.3.1.5-3 Precoding Matrix W 1 layer 4 antenna ports 'n' = -1 and 'o' = -j
const char nr_W_1l_4p[28][4][1] = {
{{'1'}, {'0'}, {'0'}, {'0'}}, // pmi 0
{{'0'}, {'1'}, {'0'}, {'0'}},
......@@ -54,7 +59,7 @@ const char nr_W_1l_4p[28][4][1] = {
{{'1'}, {'1'}, {'n'}, {'n'}},
{{'1'}, {'1'}, {'o'}, {'o'}},
{{'1'}, {'j'}, {'1'}, {'j'}}, // pmi
// 16
// 16
{{'1'}, {'j'}, {'j'}, {'n'}},
{{'1'}, {'j'}, {'n'}, {'o'}},
{{'1'}, {'j'}, {'o'}, {'1'}},
......@@ -68,14 +73,14 @@ const char nr_W_1l_4p[28][4][1] = {
{{'1'}, {'o'}, {'o'}, {'n'}} // pmi 27
};
//Table 6.3.1.5-4 Precoding Matrix W 2 antenna ports layers 2 'n' = -1 and 'o' = -j
// Table 6.3.1.5-4 Precoding Matrix W 2 antenna ports layers 2 'n' = -1 and 'o' = -j
const char nr_W_2l_2p[3][2][2] = {
{{'1', '0'}, {'0', '1'}}, // pmi 0
{{'1', '1'}, {'1', 'n'}},
{{'1', '1'}, {'j', 'o'}} // pmi 2
};
//Table 6.3.1.5-5 Precoding Matrix W 2 layers 4 antenna ports 'n' = -1 and 'o' = -j
// Table 6.3.1.5-5 Precoding Matrix W 2 layers 4 antenna ports 'n' = -1 and 'o' = -j
const char nr_W_2l_4p[22][4][2] = {
{{'1', '0'}, {'0', '1'}, {'0', '0'}, {'0', '0'}}, // pmi 0
{{'1', '0'}, {'0', '0'}, {'0', '1'}, {'0', '0'}}, {{'1', '0'}, {'0', '0'}, {'0', '0'}, {'0', '1'}},
......@@ -94,7 +99,7 @@ const char nr_W_2l_4p[22][4][2] = {
{{'1', '1'}, {'o', 'o'}, {'1', 'n'}, {'o', 'j'}}, {{'1', '1'}, {'o', 'o'}, {'j', 'o'}, {'1', 'n'}} // pmi 21
};
//Table 6.3.1.5-6 Precoding Matrix W 3 layers 4 antenna ports 'n' = -1 and 'o' = -j
// Table 6.3.1.5-6 Precoding Matrix W 3 layers 4 antenna ports 'n' = -1 and 'o' = -j
const char nr_W_3l_4p[7][4][3] = {{{'1', '0', '0'}, {'0', '1', '0'}, {'0', '0', '1'}, {'0', '0', '0'}}, // pmi 0
{{'1', '0', '0'}, {'0', '1', '0'}, {'1', '0', '0'}, {'0', '0', '1'}},
{{'1', '0', '0'}, {'0', '1', '0'}, {'n', '0', '0'}, {'0', '0', '1'}},
......@@ -103,7 +108,7 @@ const char nr_W_3l_4p[7][4][3] = {{{'1', '0', '0'}, {'0', '1', '0'}, {'0', '0',
{{'1', '1', '1'}, {'n', '1', 'n'}, {'1', '1', 'n'}, {'n', '1', '1'}},
{{'1', '1', '1'}, {'n', '1', 'n'}, {'j', 'j', 'o'}, {'o', 'j', 'j'}}};
//Table 6.3.1.5-7 Precoding Matrix W 4 layers 4 antenna ports 'n' = -1 and 'o' = -j
// Table 6.3.1.5-7 Precoding Matrix W 4 layers 4 antenna ports 'n' = -1 and 'o' = -j
const char nr_W_4l_4p[5][4][4] = {
{{'1', '0', '0', '0'}, {'0', '1', '0', '0'}, {'0', '0', '1', '0'}, {'0', '0', '0', '1'}}, // pmi 0
{{'1', '1', '0', '0'}, {'0', '0', '1', '1'}, {'1', 'n', '0', '0'}, {'0', '0', '1', 'n'}},
......@@ -112,137 +117,133 @@ const char nr_W_4l_4p[5][4][4] = {
{{'1', '1', '1', '1'}, {'1', 'n', '1', 'n'}, {'j', 'j', 'o', 'o'}, {'j', 'o', 'o', 'j'}} // pmi 4
};
void nr_modulation(const uint32_t *in,
uint32_t length,
uint16_t mod_order,
int16_t *out)
void nr_modulation(const uint32_t *in, uint32_t length, uint16_t mod_order, int16_t *out)
{
uint16_t mask = ((1<<mod_order)-1);
int32_t* nr_mod_table32;
int32_t* out32 = (int32_t*) out;
uint16_t mask = ((1 << mod_order) - 1);
int32_t *nr_mod_table32;
int32_t *out32 = (int32_t *)out;
const uint8_t *in_bytes = (const uint8_t *)in;
const uint64_t *in64 = (const uint64_t *)in;
int64_t* out64 = (int64_t*) out;
uint32_t i=0;
int64_t *out64 = (int64_t *)out;
uint32_t i = 0;
#if defined(__SSE2__)
simde__m128i *nr_mod_table128;
simde__m128i *out128;
#endif
LOG_D(PHY,"nr_modulation: length %d, mod_order %d\n",length,mod_order);
LOG_D(PHY, "nr_modulation: length %d, mod_order %d\n", length, mod_order);
switch (mod_order) {
#if defined(__SSE2__)
case 2:
nr_mod_table128 = (simde__m128i *)nr_qpsk_byte_mod_table;
out128 = (simde__m128i *)out;
for (i=0; i<length/8; i++)
out128[i] = nr_mod_table128[in_bytes[i]];
// the bits that are left out
i = i*8/2;
nr_mod_table32 = (int32_t*) nr_qpsk_mod_table;
while (i<length/2) {
const int idx = ((in_bytes[(i * 2) / 8] >> ((i * 2) & 0x7)) & mask);
out32[i] = nr_mod_table32[idx];
i++;
}
return;
case 2:
nr_mod_table128 = (simde__m128i *)nr_qpsk_byte_mod_table;
out128 = (simde__m128i *)out;
for (i = 0; i < length / 8; i++)
out128[i] = nr_mod_table128[in_bytes[i]];
// the bits that are left out
i = i * 8 / 2;
nr_mod_table32 = (int32_t *)nr_qpsk_mod_table;
while (i < length / 2) {
const int idx = ((in_bytes[(i * 2) / 8] >> ((i * 2) & 0x7)) & mask);
out32[i] = nr_mod_table32[idx];
i++;
}
return;
#else
case 2:
nr_mod_table32 = (int32_t*) nr_qpsk_mod_table;
for (i=0; i<length/mod_order; i++) {
const int idx = ((in[i * 2 / 32] >> ((i * 2) & 0x1f)) & mask);
out32[i] = nr_mod_table32[idx];
}
return;
case 2:
nr_mod_table32 = (int32_t *)nr_qpsk_mod_table;
for (i = 0; i < length / mod_order; i++) {
const int idx = ((in[i * 2 / 32] >> ((i * 2) & 0x1f)) & mask);
out32[i] = nr_mod_table32[idx];
}
return;
#endif
case 4:
out64 = (int64_t*) out;
for (i=0; i<length/8; i++)
out64[i] = nr_16qam_byte_mod_table[in_bytes[i]];
// the bits that are left out
i = i*8/4;
while (i<length/4) {
const int idx = ((in_bytes[(i * 4) / 8] >> ((i * 4) & 0x7)) & mask);
out32[i] = nr_16qam_mod_table[idx];
i++;
}
return;
case 4:
out64 = (int64_t *)out;
for (i = 0; i < length / 8; i++)
out64[i] = nr_16qam_byte_mod_table[in_bytes[i]];
// the bits that are left out
i = i * 8 / 4;
while (i < length / 4) {
const int idx = ((in_bytes[(i * 4) / 8] >> ((i * 4) & 0x7)) & mask);
out32[i] = nr_16qam_mod_table[idx];
i++;
}
return;
case 6:
if (length > (3*64))
for (i = 0; i < length - 3 * 64; i += 3 * 64) {
uint64_t x = *in64++;
uint64_t x1 = x & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x >> 12) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x >> 24) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x >> 36) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x >> 48) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
uint64_t x2 = (x >> 60);
x = *in64++;
x2 |= x<<4;
x1 = x2 & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 12) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 24) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 36) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 48) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x2 = ((x>>56)&0xf0) | (x2>>60);
x = *in64++;
x2 |= x<<8;
x1 = x2 & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 12) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 24) & 0xfff;
case 6:
if (length > (3 * 64))
for (i = 0; i < length - 3 * 64; i += 3 * 64) {
uint64_t x = *in64++;
uint64_t x1 = x & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x >> 12) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x >> 24) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x >> 36) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x >> 48) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
uint64_t x2 = (x >> 60);
x = *in64++;
x2 |= x << 4;
x1 = x2 & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 12) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 24) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 36) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 48) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x2 = ((x >> 56) & 0xf0) | (x2 >> 60);
x = *in64++;
x2 |= x << 8;
x1 = x2 & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 12) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 24) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 36) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 48) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x2 = ((x >> 52) & 0xff0) | (x2 >> 60);
*out64++ = nr_64qam_mod_table[x2];
}
while (i + 24 <= length) {
uint32_t xx = 0;
memcpy(&xx, in_bytes + i / 8, 3);
uint64_t x1 = xx & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 36) & 0xfff;
x1 = (xx >> 12) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (x2 >> 48) & 0xfff;
i += 24;
}
if (i != length) {
uint32_t xx = 0;
memcpy(&xx, in_bytes + i / 8, 2);
uint64_t x1 = xx & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x2 = ((x>>52)&0xff0) | (x2>>60);
*out64++ = nr_64qam_mod_table[x2];
}
while (i + 24 <= length) {
uint32_t xx = 0;
memcpy(&xx, in_bytes + i / 8, 3);
uint64_t x1 = xx & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
x1 = (xx >> 12) & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
i += 24;
}
if (i != length) {
uint32_t xx = 0;
memcpy(&xx, in_bytes + i / 8, 2);
uint64_t x1 = xx & 0xfff;
*out64++ = nr_64qam_mod_table[x1];
}
return;
return;
case 8:
nr_mod_table32 = (int32_t*) nr_256qam_mod_table;
for (i=0; i<length/8; i++)
out32[i] = nr_mod_table32[in_bytes[i]];
return;
case 8:
nr_mod_table32 = (int32_t *)nr_256qam_mod_table;
for (i = 0; i < length / 8; i++)
out32[i] = nr_mod_table32[in_bytes[i]];
return;
default:
break;
default:
break;
}
AssertFatal(false,"Invalid or unsupported modulation order %d\n",mod_order);
AssertFatal(false, "Invalid or unsupported modulation order %d\n", mod_order);
}
void nr_layer_mapping(int nbCodes,
......@@ -251,68 +252,309 @@ void nr_layer_mapping(int nbCodes,
uint8_t n_layers,
int layerSz,
uint32_t n_symbs,
c16_t tx_layer[layerSz],
int layer)
c16_t tx_layers[][layerSz])
{
LOG_D(PHY,"Doing layer mapping for %d layers, %d symbols\n",n_layers,n_symbs);
LOG_D(PHY, "Doing layer mapping for %d layers, %d symbols\n", n_layers, n_symbs);
c16_t *mod = mod_symbs[0];
switch (n_layers) {
case 1:
memcpy(tx_layer, mod_symbs[0], n_symbs * sizeof(**mod_symbs));
memcpy(tx_layers[0], mod, n_symbs * sizeof(**mod_symbs));
break;
case 2:
case 3:
case 4:
for (int i = 0; i < n_symbs / n_layers; i++) {
const c16_t *base = mod_symbs[0] + n_layers * i;
tx_layer[i] = base[layer];
}
break;
case 2: {
int i = 0;
c16_t *tx0 = tx_layers[0];
c16_t *tx1 = tx_layers[1];
#if defined(__AVX512BW__)
simde__m512i perm2a = simde_mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
simde__m512i perm2b = simde_mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
for (; i < (n_symbs & ~31); i += 32) {
simde__m512i a = *(simde__m512i *)(mod + i);
simde__m512i b = *(simde__m512i *)(mod + i + 16);
*(simde__m512i *)tx0 = simde_mm512_permutex2var_epi32(a, perm2a, b);
*(simde__m512i *)tx1 = simde_mm512_permutex2var_epi32(a, perm2b, b);
tx0 += 16;
tx1 += 16;
}
#endif
#ifdef __AVX2__
simde__m256i perm2 = simde_mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
for (; i < (n_symbs & ~7); i += 8) {
simde__m256i d = simde_mm256_permutevar8x32_epi32(*(simde__m256i *)(mod + i), perm2);
*(simde__m128i *)tx0 = simde_mm256_extractf128_si256(d, 0);
*(simde__m128i *)tx1 = simde_mm256_extractf128_si256(d, 1);
tx0 += 4;
tx1 += 4;
}
#endif
#if defined(__aarch64__) && defined(USE_NEON)
// SIMDe doesn't handle this properly, gcc up to 14.2 neither
uint8_t const perm0[16] = {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15};
uint8x16_t perm = vld1q_u8(perm0);
uint8x16_t d;
for (; i < (n_symbs & (~3)); i += 4) {
d = vqtbl1q_u8(*(uint8x16_t *)(mod + i), perm);
*(int64_t *)tx0 = vgetq_lane_u64((uint64x2_t)d, 0);
*(int64_t *)tx1 = vgetq_lane_u64((uint64x2_t)d, 1);
tx0 += 2;
tx1 += 2;
}
#endif
for (; i < n_symbs; i += 2) {
*tx0++ = mod[i];
*tx1++ = mod[i + 1];
}
} break;
case 3: {
int i = 0;
c16_t *tx0 = tx_layers[0];
c16_t *tx1 = tx_layers[1];
c16_t *tx2 = tx_layers[2];
#if defined(__AVX512BW__)
simde__m512i perm3_0 = simde_mm512_set_epi32(13 + 16,
10 + 16,
7 + 16,
4 + 16,
1 + 16,
14 + 16,
11 + 16,
8 + 16,
5 + 16,
2 + 16,
15,
12,
9,
6,
3,
0);
simde__m512i perm3_0b = simde_mm512_set_epi32(13 + 16, 10 + 16, 7 + 16, 4 + 16, 1 + 16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
simde__m512i perm3_1 = simde_mm512_set_epi32(14 + 16,
11 + 16,
8 + 16,
5 + 16,
2 + 16,
15 + 16,
12 + 16,
9 + 16,
6 + 16,
3 + 16,
0 + 16,
13,
10,
7,
4,
1);
simde__m512i perm3_1b = simde_mm512_set_epi32(14 + 16, 11 + 16, 8 + 16, 5 + 16, 2 + 16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
simde__m512i perm3_2 = simde_mm512_set_epi32(15 + 16,
12 + 16,
9 + 16,
6 + 16,
3 + 16,
0 + 16,
13 + 16,
10 + 16,
7 + 16,
4 + 16,
1 + 16,
14,
11,
8,
5,
2);
simde__m512i perm3_2b = simde_mm512_set_epi32(15 + 16, 12 + 16, 9 + 16, 6 + 16, 3 + 16, 0 + 16, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
for (; i < (n_symbs & ~63); i += 48) {
simde__m512i i0 = *(simde__m512i *)(mod + i);
simde__m512i i1 = *(simde__m512i *)(mod + i + 16);
simde__m512i i2 = *(simde__m512i *)(mod + i + 32);
simde__m512i d0 = simde_mm512_permutex2var_epi32(i0, perm3_0, i1);
*(simde__m512i *)tx0 = simde_mm512_permutex2var_epi32(d0, perm3_0b, i2); // 11000000
tx0 += 16;
d0 = simde_mm512_permutex2var_epi32(i0, perm3_1, i1);
*(simde__m512i *)tx1 = simde_mm512_permutex2var_epi32(d0, perm3_1b, i2); // 11000000
tx1 += 16;
d0 = simde_mm512_permutex2var_epi32(i0, perm3_2, i1);
*(simde__m512i *)tx2 = simde_mm512_permutex2var_epi32(d0, perm3_2b, i2); // 11000000
tx2 += 16;
}
#endif
#ifdef __AVX2__
{
simde__m256i perm3_0 = simde_mm256_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
simde__m256i perm3_1 = simde_mm256_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
simde__m256i perm3_2 = simde_mm256_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
for (; i < (n_symbs & ~31); i += 24) {
simde__m256i i0 = *(simde__m256i *)(mod + i);
simde__m256i i1 = *(simde__m256i *)(mod + i + 8);
simde__m256i i2 = *(simde__m256i *)(mod + i + 16);
simde__m256i d0 = simde_mm256_permutevar8x32_epi32(i0, perm3_0);
simde__m256i d1 = simde_mm256_permutevar8x32_epi32(i1, perm3_0);
simde__m256i d2 = simde_mm256_permutevar8x32_epi32(i2, perm3_0);
simde__m256i d3 = simde_mm256_blend_epi32(d0, d1, 0x38); // 00111000
*(simde__m256i *)tx0 = simde_mm256_blend_epi32(d3, d2, 0xc0); // 11000000
tx0 += 8;
d0 = simde_mm256_permutevar8x32_epi32(i0, perm3_1);
d1 = simde_mm256_permutevar8x32_epi32(i1, perm3_1);
d2 = simde_mm256_permutevar8x32_epi32(i2, perm3_1);
d3 = simde_mm256_blend_epi32(d0, d1, 0x18); // 00011000
*(simde__m256i *)tx1 = simde_mm256_blend_epi32(d3, d2, 0xe0); // 11100000
tx1 += 8;
d0 = simde_mm256_permutevar8x32_epi32(i0, perm3_2);
d1 = simde_mm256_permutevar8x32_epi32(i1, perm3_2);
d2 = simde_mm256_permutevar8x32_epi32(i2, perm3_2);
d3 = simde_mm256_blend_epi32(d0, d1, 0x1c); // 00011100
*(simde__m256i *)tx2 = simde_mm256_blend_epi32(d3, d2, 0xe0); // 11100000
tx2 += 8;
}
}
#endif
for (; i < n_symbs; i += 3) {
*tx0++ = mod[i];
*tx1++ = mod[i + 1];
*tx2++ = mod[i + 2];
}
case 5:
if (layer < 2)
for (int i = 0; i < n_symbs; i += 2) {
const int txIdx = i / 2;
tx_layer[txIdx] = mod_symbs[0][i + layer];
#ifdef DEBUG_LAYER_MAPPING
printf("\nsymb %d/%d\n", i << 3, n_symbs);
printf(" layer 0:\t");
for (int j = 0; j < 8 * 6; j += 6) {
printf("%d %d ", ((int16_t *)&mod[i << 3])[j], ((int16_t *)&mod[i << 3])[j + 1]);
}
printf("\n layer 1:\t");
for (int j = 2; j < 8 * 6; j += 6) {
printf("%d %d ", ((int16_t *)&mod[i << 3])[j], ((int16_t *)&mod[i << 3])[j + 1]);
}
printf("\n layer 2:\t");
for (int j = 4; j < 8 * 6; j += 6) {
printf("%d %d ", ((int16_t *)&mod[i << 3])[j], ((int16_t *)&mod[i << 3])[j + 1]);
}
printf("\n Mapping layer 0:\t");
for (int j = 0; j < 16; j++) {
printf("%d ", ((int16_t *)&tx_layers[0][n << 3])[j]);
}
printf("\n Mapping layer 1:\t");
for (int j = 0; j < 16; j++) {
printf("%d ", ((int16_t *)&tx_layers[1][n << 3])[j]);
}
printf("\n Mapping layer 2:\t");
for (int j = 0; j < 16; j++) {
printf("%d ", ((int16_t *)&tx_layers[2][n << 3])[j]);
}
#endif
} break;
case 4: {
int i = 0;
c16_t *tx0 = tx_layers[0];
c16_t *tx1 = tx_layers[1];
c16_t *tx2 = tx_layers[2];
c16_t *tx3 = tx_layers[3];
#if defined(__AVX512BW__)
simde__m512i perm4 = simde_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
for (; i < (n_symbs & ~15); i += 16) {
simde__m512i e = simde_mm512_permutexvar_epi32(perm4, *(simde__m512i *)(mod + i));
*(simde__m128i *)tx0 = simde_mm512_extracti64x2_epi64(e, 0);
tx0 += 4;
*(simde__m128i *)tx1 = simde_mm512_extracti64x2_epi64(e, 1);
tx1 += 4;
*(simde__m128i *)tx2 = simde_mm512_extracti64x2_epi64(e, 2);
tx2 += 4;
*(simde__m128i *)tx3 = simde_mm512_extracti64x2_epi64(e, 3);
tx3 += 4;
}
#endif
#ifdef __AVX2__
{
simde__m256i perm4 = simde_mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
for (; i < (n_symbs & ~7); i += 8) {
simde__m256i e = simde_mm256_permutevar8x32_epi32(*(simde__m256i *)(mod + i), perm4);
*(uint64_t *)tx0 = simde_mm256_extract_epi64(e, 0);
tx0 += 2;
*(uint64_t *)tx1 = simde_mm256_extract_epi64(e, 1);
tx1 += 2;
*(uint64_t *)tx2 = simde_mm256_extract_epi64(e, 2);
tx2 += 2;
*(uint64_t *)tx3 = simde_mm256_extract_epi64(e, 3);
tx3 += 2;
}
}
#endif
#if defined(__aarch64__) && defined(USE_NEON)
// SIMDe doesn't handle this properly, gcc up to 14.2 neither
for (; i < (n_symbs & ~3); i += 4) {
uint32x4_t d4 = *(uint32x4_t *)(mod + i);
*(uint32_t *)tx0 = vgetq_lane_u32(d4, 0);
tx0++;
*(uint32_t *)tx1 = vgetq_lane_u32(d4, 1);
tx1++;
*(uint32_t *)tx2 = vgetq_lane_u32(d4, 0);
tx2++;
*(uint32_t *)tx3 = vgetq_lane_u32(d4, 1);
tx3++;
}
#endif
for (; i < n_symbs; i += 4) {
*tx0++ = mod[i];
*tx1++ = mod[i + 1];
*tx2++ = mod[i + 2];
*tx3++ = mod[i + 3];
}
} break;
case 5:
case 6:
case 7:
case 8:
/*
// Layer 0,1
for (int i = 0; i < n_symbs; i += 2) {
const int txIdx = i / 2;
tx_layer[0][txIdx] = mod_symbs[0][i];
tx_layer[1][txIdx] = mod_symbs[0][i + 1];
}
// layers 2,3,4
else
for (int i = 0; i < n_symbs; i += 3) {
const int txIdx = i / 3;
tx_layer[txIdx] = mod_symbs[1][i + layer];
}
for (int i = 0; i < n_symbs; i += 3) {
const int txIdx = i / 3;
tx_layer[2][txIdx] = mod_symbs[1][i + 2];
tx_layer[3][txIdx] = mod_symbs[1][i + 3];
tx_layer[4][txIdx] = mod_symbs[1][i + 4];
}
break;
case 6:
case 6:
for (int q=0; q<2; q++)
for (int i = 0; i < n_symbs; i += 3) {
const int txIdx = i / 3;
tx_layer[txIdx] = mod_symbs[q][i + layer];
}
for (int i = 0; i < n_symbs; i += 3) {
const int txIdx = i / 3;
tx_layer[0][txIdx] = mod_symbs[q][i + layer];
tx_layer[1][txIdx] = mod_symbs[q][i + layer];
tx_layer[2][txIdx] = mod_symbs[q][i + layer];
tx_layer[3][txIdx] = mod_symbs[q][i + layer];
tx_layer[4][txIdx] = mod_symbs[q][i + layer];
tx_layer[5][txIdx] = mod_symbs[q][i + layer];
}
break;
case 7:
case 7:
if (layer < 3)
for (int i = 0; i < n_symbs; i += 3) {
const int txIdx = i / 3;
tx_layer[txIdx] = mod_symbs[1][i + layer];
}
for (int i = 0; i < n_symbs; i += 3) {
const int txIdx = i / 3;
tx_layer[txIdx] = mod_symbs[1][i + layer];
}
else
for (int i = 0; i < n_symbs; i += 4) {
const int txIdx = i / 4;
tx_layer[txIdx] = mod_symbs[0][i + layer];
}
for (int i = 0; i < n_symbs; i += 4) {
const int txIdx = i / 4;
tx_layer[txIdx] = mod_symbs[0][i + layer];
}
break;
case 8:
case 8:
for (int q=0; q<2; q++)
for (int i = 0; i < n_symbs; i += 4) {
const int txIdx = i / 4;
tx_layer[txIdx] = mod_symbs[q][i + layer];
const int txIdx = i / 4;
tx_layer[txIdx] = mod_symbs[q][i + layer];
}
break;
*/
default:
AssertFatal(0, "Invalid number of layers %d\n", n_layers);
}
......@@ -337,7 +579,7 @@ void nr_dft(c16_t *z, c16_t *d, uint32_t Msc_PUSCH)
simde__m128i norm128;
if ((Msc_PUSCH % 1536) > 0) {
for (i = 0, ip = 0; i < Msc_PUSCH; i++, ip+=4) {
for (i = 0, ip = 0; i < Msc_PUSCH; i++, ip += 4) {
dft_in0[ip] = d[i];
}
}
......@@ -357,19 +599,18 @@ void nr_dft(c16_t *z, c16_t *d, uint32_t Msc_PUSCH)
}
if ((Msc_PUSCH % 1536) > 0) {
for (i = 0, ip = 0; i < Msc_PUSCH; i++, ip+=4)
for (i = 0, ip = 0; i < Msc_PUSCH; i++, ip += 4)
z[i] = dft_out0[ip];
}
}
void perform_symbol_rotation(NR_DL_FRAME_PARMS *fp, double f0, c16_t *symbol_rotation)
{
const int nsymb = fp->symbols_per_slot * fp->slots_per_frame/10;
const double Tc=(1/480e3/4096);
const double Nu=2048*64*(1/(float)(1<<fp->numerology_index));
const double Ncp0=16*64 + (144*64*(1/(float)(1<<fp->numerology_index)));
const double Ncp1=(144*64*(1/(float)(1<<fp->numerology_index)));
const int nsymb = fp->symbols_per_slot * fp->slots_per_frame / 10;
const double Tc = (1 / 480e3 / 4096);
const double Nu = 2048 * 64 * (1 / (float)(1 << fp->numerology_index));
const double Ncp0 = 16 * 64 + (144 * 64 * (1 / (float)(1 << fp->numerology_index)));
const double Ncp1 = (144 * 64 * (1 / (float)(1 << fp->numerology_index)));
LOG_D(PHY, "Doing symbol rotation calculation for TX/RX, f0 %f Hz, Nsymb %d\n", f0, nsymb);
......@@ -430,7 +671,9 @@ void init_timeshift_rotation(NR_DL_FRAME_PARMS *fp)
fp->timeshift_symbol_rotation[i].i = (int16_t)round(exp_im * 32767);
if (i < 10)
LOG_D(PHY,"Timeshift symbol rotation %d => (%d,%d) %f\n",i,
LOG_D(PHY,
"Timeshift symbol rotation %d => (%d,%d) %f\n",
i,
fp->timeshift_symbol_rotation[i].r,
fp->timeshift_symbol_rotation[i].i,
poff);
......@@ -441,13 +684,13 @@ c16_t nr_layer_precoder(int sz, c16_t datatx_F_precoding[][sz], const char *prec
{
c16_t precodatatx_F = {0};
for (int al = 0; al<n_layers; al++) {
for (int al = 0; al < n_layers; al++) {
c16_t antenna = datatx_F_precoding[al][re_offset];
switch (prec_matrix[al]) {
case '0': //multiply by zero
case '0': // multiply by zero
break;
case '1': //multiply by 1
case '1': // multiply by 1
precodatatx_F = c16add(precodatatx_F, antenna);
break;
......@@ -474,33 +717,29 @@ c16_t nr_layer_precoder(int sz, c16_t datatx_F_precoding[][sz], const char *prec
}
c16_t nr_layer_precoder_cm(int n_layers,
int n_symbols,
int symSz,
c16_t datatx_F_precoding[n_layers][n_symbols][symSz],
c16_t datatx_F_precoding[n_layers][symSz],
int ap,
nfapi_nr_pm_pdu_t *pmi_pdu,
int symbol,
int offset)
{
c16_t precodatatx_F = {0};
for (int al = 0; al < n_layers; al++) {
nfapi_nr_pm_weights_t *w = &pmi_pdu->weights[al][ap];
c16_t prec_weight = {.r = w->precoder_weight_Re, .i = w->precoder_weight_Im};
precodatatx_F = c16maddShift(datatx_F_precoding[al][symbol][offset], prec_weight, precodatatx_F, 15);
precodatatx_F = c16maddShift(datatx_F_precoding[al][offset], prec_weight, precodatatx_F, 15);
}
return precodatatx_F;
}
void nr_layer_precoder_simd(const int n_layers,
const int n_symbols,
const int symSz,
const c16_t txdataF_res_mapped[n_layers][n_symbols][symSz],
const int ant,
const nfapi_nr_pm_pdu_t *pmi_pdu,
const int symbol,
const int sc_offset,
const int re_cnt,
c16_t *txdataF_precoded)
const int symSz,
const c16_t txdataF_res_mapped[n_layers][symSz],
const int ant,
const nfapi_nr_pm_pdu_t *pmi_pdu,
const int sc_offset,
const int re_cnt,
c16_t *txdataF_precoded)
{
uint32_t sc = sc_offset;
c16_t prec_weight = {0};
......@@ -510,22 +749,25 @@ void nr_layer_precoder_simd(const int n_layers,
// 256 SIMD: Do 8 RE in one iteration, 3 iterations for 2 RB
#ifdef __AVX2__
const uint32_t re_cnt_align8 = re_cnt & ~7;
for(; sc < sc_offset + (re_cnt_align8); sc += sizeof(simde__m256i) / sizeof(prec_weight)) {
for (; sc < sc_offset + (re_cnt_align8); sc += sizeof(simde__m256i) / sizeof(prec_weight)) {
// Matrix multiplication for 4 elements of the result (sizeof(simde__m256i) / sizeof(*prec_matrix) = 8)
simde__m256i y = simde_mm256_set1_epi16(0); // Y = W[0]*X[0] + W[1]*X[1] + ... + W[nrOfLayers-1]*X[nrOfLayers-1]
for(int nl = 0; nl < n_layers; nl++) {
for (int nl = 0; nl < n_layers; nl++) {
prec_weight.r = pmi_pdu->weights[nl][ant].precoder_weight_Re;
prec_weight.i = pmi_pdu->weights[nl][ant].precoder_weight_Im;
const simde__m256i x = simde_mm256_loadu_si256(&txdataF_res_mapped[nl][symbol][sc]);
const simde__m256i x = simde_mm256_loadu_si256(&txdataF_res_mapped[nl][sc]);
// Rearrange precoding matrix weight to match complex multiplication and broadcast it to match SIMD size
const simde__m256i w_c = simde_mm256_set1_epi32(c16toI32(c16conj(prec_weight))); // broadcast conjugate of w
const simde__m256i w_s = simde_mm256_set1_epi32(c16toI32(c16swap(prec_weight))); // broadcast swapped real and img of w
const simde__m256i w_c = simde_mm256_set1_epi32(c16toI32(c16conj(prec_weight))); // broadcast conjugate of w
const simde__m256i w_s = simde_mm256_set1_epi32(c16toI32(c16swap(prec_weight))); // broadcast swapped real and img of w
// Multiplication and shift
const simde__m256i reals = simde_mm256_srai_epi32(simde_mm256_madd_epi16(x, w_c), 15); // (int32_t) .r = (x.r * w.r - x.i * w.i) >> 15
const simde__m256i imags = simde_mm256_slli_epi32(simde_mm256_madd_epi16(x, w_s), 1); // (int32_t) .i = (x.r * w.i + x.i * w.r) << 1, since higher 16 bit of each 32 bit is taken by blend_epi16
const simde__m256i reals =
simde_mm256_srai_epi32(simde_mm256_madd_epi16(x, w_c), 15); // (int32_t) .r = (x.r * w.r - x.i * w.i) >> 15
const simde__m256i imags = simde_mm256_slli_epi32(
simde_mm256_madd_epi16(x, w_s),
1); // (int32_t) .i = (x.r * w.i + x.i * w.r) << 1, since higher 16 bit of each 32 bit is taken by blend_epi16
// Re-arrange to match c16_t format
const simde__m256i produ = simde_mm256_blend_epi16(reals, imags, 0xAA);
......@@ -540,43 +782,40 @@ void nr_layer_precoder_simd(const int n_layers,
// 128 SIMD: Do 4 RE in one iteration, 3 iterations for 1 RB
const uint32_t re_cnt_align4 = re_cnt & ~3;
for(; sc < sc_offset+re_cnt_align4; sc += sizeof(simde__m128i) / sizeof(prec_weight)) {
#ifdef DEBUG_DLSCH_PRECODING_PRINT_WITH_TRIVIAL // Get result with trivial solution, TODO: To be removed
c16_t y_triv[4];
for(int i = 0; i < 4; i++)
y_triv[i] = nr_layer_precoder_cm(n_layers,
NR_SYMBOLS_PER_SLOT,
symSz,
txdataF_res_mapped,
ant,
pmi_pdu,
symbol,
sc + i);
memcpy(&txdataF_precoded[sc], y_triv, sizeof(y_triv));
#endif
for (; sc < sc_offset + re_cnt_align4; sc += sizeof(simde__m128i) / sizeof(prec_weight)) {
#ifdef DEBUG_DLSCH_PRECODING_PRINT_WITH_TRIVIAL // Get result with trivial solution, TODO: To be removed
c16_t y_triv[4];
for (int i = 0; i < 4; i++)
y_triv[i] = nr_layer_precoder_cm(n_layers, symSz, txdataF_res_mapped, ant, pmi_pdu, sc + i);
memcpy(&txdataF_precoded[sc], y_triv, sizeof(y_triv));
#endif
// Matrix multiplication for 4 elements of the result (sizeof(simde__m128i) / sizeof(c16_t) = 4)
simde__m128i y = simde_mm_set1_epi16(0); // Y = W[0]*X[0] + W[1]*X[1] + ... + W[nrOfLayers-1]*X[nrOfLayers-1]
for(int nl = 0; nl < n_layers; nl++) {
for (int nl = 0; nl < n_layers; nl++) {
prec_weight.r = pmi_pdu->weights[nl][ant].precoder_weight_Re;
prec_weight.i = pmi_pdu->weights[nl][ant].precoder_weight_Im;
const simde__m128i x = simde_mm_loadu_si128(&txdataF_res_mapped[nl][symbol][sc]);
const simde__m128i x = simde_mm_loadu_si128(&txdataF_res_mapped[nl][sc]);
// Rearrange precoding matrix weight to match complex multiplication and broadcast it to match SIMD size
const simde__m128i w_c = simde_mm_set1_epi32(c16toI32(c16conj(prec_weight))); // broadcast conjugate of w
const simde__m128i w_s = simde_mm_set1_epi32(c16toI32(c16swap(prec_weight))); // broadcast swapped real and img of w
const simde__m128i w_c = simde_mm_set1_epi32(c16toI32(c16conj(prec_weight))); // broadcast conjugate of w
const simde__m128i w_s = simde_mm_set1_epi32(c16toI32(c16swap(prec_weight))); // broadcast swapped real and img of w
// Multiplication and shift
const simde__m128i reals = simde_mm_srai_epi32(simde_mm_madd_epi16(x, w_c), 15); // (int32_t) .r = (x.r * w.r - x.i * w.i) >> 15
const simde__m128i imags = simde_mm_slli_epi32(simde_mm_madd_epi16(x, w_s), 1); // (int32_t) .i = (x.r * w.i + x.i * w.r) << 1, since higher 16 bit of each 32 bit is taken by blend_epi16
const simde__m128i reals =
simde_mm_srai_epi32(simde_mm_madd_epi16(x, w_c), 15); // (int32_t) .r = (x.r * w.r - x.i * w.i) >> 15
const simde__m128i imags = simde_mm_slli_epi32(
simde_mm_madd_epi16(x, w_s),
1); // (int32_t) .i = (x.r * w.i + x.i * w.r) << 1, since higher 16 bit of each 32 bit is taken by blend_epi16
/* Re-arrange to match c16_t format
bit index: 0 | 16 | 32 | 48 | 64 | 80 | 96 | 112
reals = {R0.r[15..30] | R0.r[31] (0)*15 | R1.r[15..30] | R1.r[31] (0)*15 | R2.r[15..30] | R2.r[31] (0)*15 | R3.r[15..30] | R3.r[31] (0)*15}
imags = {0 R0.i[0..14]| R0.i[15..30] | 0 R1.i[0..14]| R1.i[15..30] | 0 R2.i[0..14]| R2.i[15..30] | 0 R3.i[0..14]| R3.i[15..30] }
16b from {reals | imags | reals | imags | reals | imags | reals | imags }
produ = {R0.r[15..30] | R0.i[15..30] | R1.r[15..30] | R1.i[15..30] | R2.r[15..30] | R2.i[15..30] | R3.r[15..30] | R3.i[15..30] }
bit index: 0 | 16 | 32 | 48 | 64 | 80 | 96 | 112
reals = {R0.r[15..30] | R0.r[31] (0)*15 | R1.r[15..30] | R1.r[31] (0)*15 | R2.r[15..30] | R2.r[31] (0)*15 | R3.r[15..30]
| R3.r[31] (0)*15} imags = {0 R0.i[0..14]| R0.i[15..30] | 0 R1.i[0..14]| R1.i[15..30] | 0 R2.i[0..14]| R2.i[15..30]
| 0 R3.i[0..14]| R3.i[15..30] } 16b from {reals | imags | reals | imags | reals | imags | reals
| imags } produ = {R0.r[15..30] | R0.i[15..30] | R1.r[15..30] | R1.i[15..30] | R2.r[15..30] | R2.i[15..30] |
R3.r[15..30] | R3.i[15..30] }
*/
const simde__m128i produ = simde_mm_blend_epi16(reals, imags, 0xAA);
......@@ -586,30 +825,30 @@ void nr_layer_precoder_simd(const int n_layers,
// Store the result to txdataF
simde_mm_storeu_si128(&txdataF_precoded[sc], y);
#ifdef DEBUG_DLSCH_PRECODING_PRINT_WITH_TRIVIAL // Print simd and trivial result, TODO: To be removed
c16_t *y_simd = (c16_t*) &y;
printf("debug_to_be_removed re_cnt=%d, sc=%u, y_simd=(%+4d,%+4d), (%+4d,%+4d), (%+4d,%+4d), (%+4d,%+4d)\n",
re_cnt,
sc,
y_simd[0].r,
y_simd[0].i,
y_simd[1].r,
y_simd[1].i,
y_simd[2].r,
y_simd[2].i,
y_simd[3].r,
y_simd[3].i);
printf("debug_to_be_removed re_cnt=%d, sc=%u, y_triv=(%+4d,%+4d), (%+4d,%+4d), (%+4d,%+4d), (%+4d,%+4d)\n",
re_cnt,
sc,
y_triv[0].r,
y_triv[0].i,
y_triv[1].r,
y_triv[1].i,
y_triv[2].r,
y_triv[2].i,
y_triv[3].r,
y_triv[3].i);
#endif
#ifdef DEBUG_DLSCH_PRECODING_PRINT_WITH_TRIVIAL // Print simd and trivial result, TODO: To be removed
c16_t *y_simd = (c16_t *)&y;
printf("debug_to_be_removed re_cnt=%d, sc=%u, y_simd=(%+4d,%+4d), (%+4d,%+4d), (%+4d,%+4d), (%+4d,%+4d)\n",
re_cnt,
sc,
y_simd[0].r,
y_simd[0].i,
y_simd[1].r,
y_simd[1].i,
y_simd[2].r,
y_simd[2].i,
y_simd[3].r,
y_simd[3].i);
printf("debug_to_be_removed re_cnt=%d, sc=%u, y_triv=(%+4d,%+4d), (%+4d,%+4d), (%+4d,%+4d), (%+4d,%+4d)\n",
re_cnt,
sc,
y_triv[0].r,
y_triv[0].i,
y_triv[1].r,
y_triv[1].i,
y_triv[2].r,
y_triv[2].i,
y_triv[3].r,
y_triv[3].i);
#endif
}
}
......@@ -60,8 +60,7 @@ void nr_layer_mapping(int nbCodes,
uint8_t n_layers,
int layerSz,
uint32_t n_symbs,
c16_t tx_layers[layerSz],
int l);
c16_t tx_layers[][layerSz]);
/*! \brief Perform NR layer mapping. TS 38.211 V15.4.0 subclause 7.3.1.3
@param[in] ulsch_ue, double Pointer to NR_UE_ULSCH_t struct
......@@ -136,12 +135,10 @@ void apply_nr_rotation_RX(const NR_DL_FRAME_PARMS *frame_parms,
c16_t nr_layer_precoder(int sz, c16_t datatx_F_precoding[][sz], const char *prec_matrix, uint8_t n_layers, int32_t re_offset);
c16_t nr_layer_precoder_cm(int n_layers,
int n_symbols,
int symSz,
c16_t datatx_F_precoding[n_layers][n_symbols][symSz],
c16_t datatx_F_precoding[n_layers][symSz],
int ap,
nfapi_nr_pm_pdu_t *pmi_pdu,
int symbol,
int offset);
/*! \brief Precoding with SIMDe, txdataF_precoded[] = prec_matrix[] * txdataF_res_mapped[]
......@@ -151,13 +148,11 @@ c16_t nr_layer_precoder_cm(int n_layers,
@param[out] txdataF_precoded Precoded antenna data
*/
void nr_layer_precoder_simd(const int n_layers,
const int n_symbols,
const int symSz,
const c16_t txdataF_res_mapped[n_layers][n_symbols][symSz],
const int ant,
const nfapi_nr_pm_pdu_t *pmi_pdu,
const int symbol,
const int sc_offset,
const int re_cnt,
c16_t *txdataF_precoded);
const int symSz,
const c16_t txdataF_res_mapped[n_layers][symSz],
const int ant,
const nfapi_nr_pm_pdu_t *pmi_pdu,
const int sc_offset,
const int re_cnt,
c16_t *txdataF_precoded);
#endif
......@@ -249,9 +249,11 @@ void nr_generate_dci_top(processingData_L1tx_t *msgTx, int slot, int txdataF_off
{
PHY_VARS_gNB *gNB = msgTx->gNB;
NR_DL_FRAME_PARMS *frame_parms = &gNB->frame_parms;
start_meas(&gNB->dci_generation_stats);
for (int i = 0; i < msgTx->num_ul_pdcch; i++)
nr_generate_dci(msgTx->gNB, &msgTx->ul_pdcch_pdu[i].pdcch_pdu.pdcch_pdu_rel15, txdataF_offset, frame_parms, slot);
for (int i = 0; i < msgTx->num_dl_pdcch; i++)
nr_generate_dci(msgTx->gNB, &msgTx->pdcch_pdu[i].pdcch_pdu_rel15, txdataF_offset, frame_parms, slot);
stop_meas(&gNB->dci_generation_stats);
}
......@@ -41,39 +41,761 @@
#include "executables/softmodem-common.h"
#include "SCHED_NR/sched_nr.h"
//#define DEBUG_DLSCH
//#define DEBUG_DLSCH_MAPPING
// #define DEBUG_DLSCH
// #define DEBUG_DLSCH_MAPPING
#include <simde/x86/avx512.h>
#define USE128BIT
static void nr_pdsch_codeword_scrambling(uint8_t *in, uint32_t size, uint8_t q, uint32_t Nid, uint32_t n_RNTI, uint32_t *out)
{
nr_codeword_scrambling(in, size, q, Nid, n_RNTI, out);
}
static int do_ptrs_symbol(nfapi_nr_dl_tti_pdsch_pdu_rel15_t *rel15,
int start_sc,
int symbol_sz,
c16_t *txF,
c16_t *tx_layer,
int amp,
c16_t *mod_ptrs)
{
int ptrs_idx = 0;
int k = start_sc;
c16_t *in = tx_layer;
for (int i = 0; i < rel15->rbSize * NR_NB_SC_PER_RB; i++) {
/* check for PTRS symbol and set flag for PTRS RE */
bool is_ptrs_re =
is_ptrs_subcarrier(k, rel15->rnti, rel15->PTRSFreqDensity, rel15->rbSize, rel15->PTRSReOffset, start_sc, symbol_sz);
if (is_ptrs_re) {
/* check if cuurent RE is PTRS RE*/
uint16_t beta_ptrs = 1;
txF[k] = c16mulRealShift(mod_ptrs[ptrs_idx], beta_ptrs * amp, 15);
#ifdef DEBUG_DLSCH_MAPPING
printf("ptrs_idx %d\t \t k %d \t \t txdataF: %d %d, mod_ptrs: %d %d\n",
ptrs_idx,
k,
txF[k].r,
txF[k].i,
mod_ptrs[ptrs_idx].r,
mod_ptrs[ptrs_idx].i);
#endif
ptrs_idx++;
} else {
txF[k] = c16mulRealShift(*in++, amp, 15);
#ifdef DEBUG_DLSCH_MAPPING
printf("k %d \t txdataF: %d %d\n", k, txF[k].r, txF[k].i);
#endif
}
if (++k >= symbol_sz)
k -= symbol_sz;
}
return in - tx_layer;
}
typedef union {
uint64_t l;
c16_t s[2];
} amp_t;
static inline int interleave_with_0_signal_first(c16_t *output, c16_t *mod_dmrs, const int amp_dmrs, int sz)
{
#ifdef DEBUG_DLSCH_MAPPING
printf("doing DMRS pattern for port 0 : d0 0 d1 0 ... dNm2 0 dNm1 0 (ul %d, rr %d)\n", upper_limit, remaining_re);
#endif
// add filler to process all as SIMD
c16_t *out = output;
int i = 0;
int end = sz / 2;
#if defined(__AVX512BW__)
simde__m512i zeros512 = simde_mm512_setzero_si512(), amp_dmrs512 = simde_mm512_set1_epi16(amp_dmrs);
simde__m512i perml = simde_mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
simde__m512i permh = simde_mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (; i < (end & ~15); i += 16) {
simde__m512i d0 = simde_mm512_mulhrs_epi16(_mm512_loadu_si512((simde__m512i *)(mod_dmrs + i)), amp_dmrs512);
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(d0, perml, zeros512));
out += 16;
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(d0, permh, zeros512));
out += 16;
}
#endif
#if defined(__AVX2__)
simde__m256i zeros256 = simde_mm256_setzero_si256(), amp_dmrs256 = simde_mm256_set1_epi16(amp_dmrs);
for (; i < (end & ~7); i += 8) {
simde__m256i d0 = simde_mm256_mulhrs_epi16(simde_mm256_loadu_si256((simde__m256i *)(mod_dmrs + i)), amp_dmrs256);
simde__m256i d2 = simde_mm256_unpacklo_epi32(d0, zeros256);
simde__m256i d3 = simde_mm256_unpackhi_epi32(d0, zeros256);
simde_mm256_storeu_si256((simde__m256i *)out, simde_mm256_permute2x128_si256(d2, d3, 32));
out += 8;
simde_mm256_storeu_si256((simde__m256i *)out, simde_mm256_permute2x128_si256(d2, d3, 49));
out += 8;
}
#endif
#if defined(USE128BIT)
simde__m128i zeros = simde_mm_setzero_si128(), amp_dmrs128 = simde_mm_set1_epi16(amp_dmrs);
for (; i < (end & ~3); i += 4) {
simde__m128i d0 = simde_mm_mulhrs_epi16(simde_mm_loadu_si128((simde__m128i *)(mod_dmrs + i)), amp_dmrs128);
simde__m128i d2 = simde_mm_unpacklo_epi32(d0, zeros);
simde__m128i d3 = simde_mm_unpackhi_epi32(d0, zeros);
simde_mm_storeu_si128((simde__m128i *)out, d2);
out += 4;
simde_mm_storeu_si128((simde__m128i *)out, d3);
out += 4;
}
#endif
for (; i < end; i++) {
*out++ = c16mulRealShift(mod_dmrs[i], amp_dmrs, 15);
*out++ = (c16_t){};
}
return 0;
}
static inline int interleave_with_0_start_with_0(c16_t *output, c16_t *mod_dmrs, const int amp_dmrs, int sz)
{
#ifdef DEBUG_DLSCH_MAPPING
printf("doing DMRS pattern for port 2 : 0 d0 0 d1 ... 0 dNm2 0 dNm1\n");
#endif
c16_t *out = output;
int i = 0;
int end = sz / 2;
#if defined(__AVX512BW__)
simde__m512i zeros512 = simde_mm512_setzero_si512(), amp_dmrs512 = simde_mm512_set1_epi16(amp_dmrs);
simde__m512i perml = simde_mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
simde__m512i permh = simde_mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (; i < (end & ~15); i += 16) {
simde__m512i d0 = simde_mm512_mulhrs_epi16(_mm512_loadu_si512((simde__m512i *)(mod_dmrs + i)), amp_dmrs512);
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(zeros512, perml, d0));
out += 16;
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(zeros512, permh, d0));
out += 16;
}
#endif
#if defined(__AVX2__)
simde__m256i zeros256 = simde_mm256_setzero_si256(), amp_dmrs256 = simde_mm256_set1_epi16(amp_dmrs);
for (; i < (end & ~7); i += 8) {
simde__m256i d0 = simde_mm256_mulhrs_epi16(simde_mm256_loadu_si256((simde__m256i *)(mod_dmrs + i)), amp_dmrs256);
simde__m256i d2 = simde_mm256_unpacklo_epi32(zeros256, d0);
simde__m256i d3 = simde_mm256_unpackhi_epi32(zeros256, d0);
simde_mm256_storeu_si256((simde__m256i *)out, simde_mm256_permute2x128_si256(d2, d3, 32));
out += 8;
simde_mm256_storeu_si256((simde__m256i *)out, simde_mm256_permute2x128_si256(d2, d3, 49));
out += 8;
}
#endif
#if defined(USE128BIT)
simde__m128i zeros = simde_mm_setzero_si128(), amp_dmrs128 = simde_mm_set1_epi16(amp_dmrs);
for (; i < (end & ~3); i += 4) {
simde__m128i d0 = simde_mm_mulhrs_epi16(simde_mm_loadu_si128((simde__m128i *)(mod_dmrs + i)), amp_dmrs128);
simde__m128i d2 = simde_mm_unpacklo_epi32(zeros, d0);
simde__m128i d3 = simde_mm_unpackhi_epi32(zeros, d0);
simde_mm_storeu_si128((simde__m128i *)out, d2);
out += 4;
simde_mm_storeu_si128((simde__m128i *)out, d3);
out += 4;
}
#endif
for (; i < end; i++) {
*out++ = (c16_t){};
*out++ = c16mulRealShift(mod_dmrs[i], amp_dmrs, 15);
}
return 0;
}
static inline int interleave_signals(c16_t *output, c16_t *signal1, const int amp, c16_t *signal2, const int amp2, int sz)
{
#ifdef DEBUG_DLSCH_MAPPING
printf("doing DMRS pattern for port 0 : d0 X0 d1 X1 ... dNm2 XNm2 dNm1 XNm1\n");
#endif
// add filler to process all as SIMD
c16_t *out = output;
int i = 0;
int end = sz / 2;
#if defined(__AVX512BW__)
simde__m512i amp2512 = simde_mm512_set1_epi16(amp2), amp512 = simde_mm512_set1_epi16(amp);
simde__m512i perml = simde_mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
simde__m512i permh = simde_mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (; i < (end & ~15); i += 16) {
simde__m512i d0 = simde_mm512_mulhrs_epi16(_mm512_loadu_si512((simde__m512i *)(signal2 + i)), amp2512);
simde__m512i d1 = simde_mm512_mulhrs_epi16(_mm512_loadu_si512((simde__m512i *)(signal1 + i)), amp512);
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(d0, perml, d1));
out += 16;
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(d0, permh, d1));
out += 16;
}
#endif
#if defined(__AVX2__)
simde__m256i amp2256 = simde_mm256_set1_epi16(amp2), amp256 = simde_mm256_set1_epi16(amp);
for (; i < (end & ~7); i += 8) {
simde__m256i d0 = simde_mm256_mulhrs_epi16(simde_mm256_loadu_si256((simde__m256i *)(signal2 + i)), amp2256);
simde__m256i d1 = simde_mm256_mulhrs_epi16(simde_mm256_loadu_si256((simde__m256i *)(signal1 + i)), amp256);
simde__m256i d2 = simde_mm256_unpacklo_epi32(d0, d1);
simde__m256i d3 = simde_mm256_unpackhi_epi32(d0, d1);
simde_mm256_storeu_si256((simde__m256i *)out, simde_mm256_permute2x128_si256(d2, d3, 32));
out += 8;
simde_mm256_storeu_si256((simde__m256i *)out, simde_mm256_permute2x128_si256(d2, d3, 49));
out += 8;
}
#endif
#if defined(USE128BIT)
simde__m128i amp2128 = simde_mm_set1_epi16(amp2), amp128 = simde_mm_set1_epi16(amp);
for (; i < (end & ~3); i += 4) {
simde__m128i d0 = simde_mm_mulhrs_epi16(simde_mm_loadu_si128((simde__m128i *)(signal2 + i)), amp2128);
simde__m128i d1 = simde_mm_mulhrs_epi16(simde_mm_loadu_si128((simde__m128i *)(signal1 + i)), amp128);
simde__m128i d2 = simde_mm_unpacklo_epi32(d0, d1);
simde__m128i d3 = simde_mm_unpackhi_epi32(d0, d1);
simde_mm_storeu_si128((simde__m128i *)out, d2);
out += 4;
simde_mm_storeu_si128((simde__m128i *)out, d3);
out += 4;
}
#endif
for (; i < end; i++) {
*out++ = c16mulRealShift(signal2[i], amp2, 15);
*out++ = c16mulRealShift(signal1[i], amp, 15);
}
return sz / 2;
}
static inline int dmrs_case00(c16_t *output,
c16_t *txl,
c16_t *mod_dmrs,
const int amp_dmrs,
const int amp,
int sz,
int start_sc,
int remaining_re,
int dmrs_port,
const int dmrs_Type,
int symbol_sz,
int l_prime,
uint8_t numDmrsCdmGrpsNoData)
{
// DMRS params for this dmrs port
int Wt[2], Wf[2];
get_Wt(Wt, dmrs_port, dmrs_Type);
get_Wf(Wf, dmrs_port, dmrs_Type);
const int8_t delta = get_delta(dmrs_port, dmrs_Type);
int dmrs_idx = 0;
int k = start_sc;
c16_t *in = txl;
uint8_t k_prime = 0;
uint16_t n = 0;
for (int i = 0; i < sz; i++) {
if (k == ((start_sc + get_dmrs_freq_idx(n, k_prime, delta, dmrs_Type)) % (symbol_sz))) {
output[k] = c16mulRealShift(mod_dmrs[dmrs_idx], Wt[l_prime] * Wf[k_prime] * amp_dmrs, 15);
dmrs_idx++;
k_prime = (k_prime + 1) & 1;
n += (k_prime ? 0 : 1);
}
/* Map PTRS Symbol */
/* Map DATA Symbol */
else if (allowed_xlsch_re_in_dmrs_symbol(k, start_sc, symbol_sz, numDmrsCdmGrpsNoData, dmrs_Type)) {
output[k] = c16mulRealShift(*in++, amp, 15);
}
/* mute RE */
else {
output[k] = (c16_t){0};
}
k = (k + 1) % symbol_sz;
} // RE loop
return in - txl;
}
static inline int no_ptrs_dmrs_case(c16_t *output, c16_t *txl, const int amp, const int sz)
{
// Loop Over SCs:
int i = 0;
#if defined(__AVX512BW__)
simde__m512i amp512 = simde_mm512_set1_epi16(amp);
for (; i < (sz & ~15); i += 16) {
const simde__m512i txL = simde_mm512_loadu_si512((simde__m512i *)(txl + i));
simde_mm512_storeu_si512((simde__m512i *)(output + i), simde_mm512_mulhrs_epi16(amp512, txL));
}
#endif
#if defined(__AVX2__)
simde__m256i amp256 = simde_mm256_set1_epi16(amp);
for (; i < (sz & ~7); i += 8) {
const simde__m256i txL = simde_mm256_loadu_si256((simde__m256i *)(txl + i));
simde_mm256_storeu_si256((simde__m256i *)(output + i), _mm256_mulhrs_epi16(amp256, txL));
}
#endif
#if defined(USE128BIT)
simde__m128i amp128 = simde_mm_set1_epi16(amp);
for (; i < (sz & ~3); i += 4) {
const simde__m128i txL = simde_mm_loadu_si128((simde__m128i *)(txl + i));
simde_mm_storeu_si128((simde__m128i *)(output + i), simde_mm_mulhrs_epi16(amp128, txL));
}
#endif
for (; i < sz; i++) {
output[i] = c16mulRealShift(txl[i], amp, 15);
}
return sz;
}
static inline void neg_dmrs(c16_t *in, c16_t *out, int sz)
{
for (int i = 0; i < sz; i++)
*out++ = i % 2 ? (c16_t){-in[i].r, -in[i].i} : in[i];
}
static inline int do_onelayer(NR_DL_FRAME_PARMS *frame_parms,
int slot,
nfapi_nr_dl_tti_pdsch_pdu_rel15_t *rel15,
int layer,
c16_t *output,
c16_t *txl_start,
int start_sc,
int symbol_sz,
int l_symbol,
uint16_t dlPtrsSymPos,
int n_ptrs,
int amp,
int amp_dmrs,
int l_prime,
nfapi_nr_dmrs_type_e dmrs_Type,
c16_t *dmrs_start)
{
c16_t *txl = txl_start;
const uint sz = rel15->rbSize * NR_NB_SC_PER_RB;
int upper_limit = sz;
int remaining_re = 0;
if (start_sc + upper_limit > symbol_sz) {
upper_limit = symbol_sz - start_sc;
remaining_re = sz - upper_limit;
}
/* calculate if current symbol is PTRS symbols */
int ptrs_symbol = 0;
if (rel15->pduBitmap & 0x1) {
ptrs_symbol = is_ptrs_symbol(l_symbol, dlPtrsSymPos);
}
if (ptrs_symbol) {
/* PTRS QPSK Modulation for each OFDM symbol in a slot */
LOG_D(PHY, "Doing ptrs modulation for symbol %d, n_ptrs %d\n", l_symbol, n_ptrs);
c16_t mod_ptrs[max(n_ptrs, 1)]
__attribute__((aligned(64))); // max only to please sanitizer, that kills if 0 even if it is not a error
const uint32_t *gold =
nr_gold_pdsch(frame_parms->N_RB_DL, frame_parms->symbols_per_slot, rel15->dlDmrsScramblingId, rel15->SCID, slot, l_symbol);
nr_modulation(gold, n_ptrs * DMRS_MOD_ORDER, DMRS_MOD_ORDER, (int16_t *)mod_ptrs);
txl += do_ptrs_symbol(rel15, start_sc, symbol_sz, output, txl, amp, mod_ptrs);
} else if (rel15->dlDmrsSymbPos & (1 << l_symbol)) {
/* Map DMRS Symbol */
int dmrs_port = get_dmrs_port(layer, rel15->dmrsPorts);
if (l_prime == 0 && dmrs_Type == NFAPI_NR_DMRS_TYPE1) {
if (rel15->numDmrsCdmGrpsNoData == 2) {
switch (dmrs_port & 3) {
case 0:
txl += interleave_with_0_signal_first(output + start_sc, dmrs_start, amp_dmrs, upper_limit);
txl += interleave_with_0_signal_first(output, dmrs_start + upper_limit / 2, amp_dmrs, remaining_re);
break;
case 1: {
c16_t dmrs[sz / 2];
neg_dmrs(dmrs_start, dmrs, sz / 2);
txl += interleave_with_0_signal_first(output + start_sc, dmrs, amp_dmrs, upper_limit);
txl += interleave_with_0_signal_first(output, dmrs + upper_limit / 2, amp_dmrs, remaining_re);
} break;
case 2:
txl += interleave_with_0_start_with_0(output + start_sc, dmrs_start, amp_dmrs, upper_limit);
txl += interleave_with_0_start_with_0(output, dmrs_start + upper_limit / 2, amp_dmrs, remaining_re);
break;
case 3: {
c16_t dmrs[sz / 2];
neg_dmrs(dmrs_start, dmrs, sz / 2);
txl += interleave_with_0_start_with_0(output + start_sc, dmrs, amp_dmrs, upper_limit);
txl += interleave_with_0_start_with_0(output, dmrs + upper_limit / 2, amp_dmrs, remaining_re);
} break;
}
} else if (rel15->numDmrsCdmGrpsNoData == 1) {
switch (dmrs_port & 3) {
case 0:
txl += interleave_signals(output + start_sc, txl, amp, dmrs_start, amp_dmrs, upper_limit);
txl += interleave_signals(output, txl, amp, dmrs_start + upper_limit / 2, amp_dmrs, remaining_re);
break;
case 1: {
c16_t dmrs[sz / 2];
neg_dmrs(dmrs_start, dmrs, sz / 2);
txl += interleave_signals(output + start_sc, txl, amp, dmrs, amp_dmrs, upper_limit);
txl += interleave_signals(output, txl, amp, dmrs + upper_limit / 2, amp_dmrs, remaining_re);
} break;
case 2:
txl += interleave_signals(output + start_sc, dmrs_start, amp_dmrs, txl, amp, upper_limit);
txl += interleave_signals(output, dmrs_start + upper_limit / 2, amp_dmrs, txl, amp, remaining_re);
break;
case 3: {
c16_t dmrs[sz / 2];
neg_dmrs(dmrs_start, dmrs, sz / 2);
txl += interleave_signals(output + start_sc, dmrs, amp_dmrs, txl, amp, upper_limit);
txl += interleave_signals(output, dmrs + upper_limit / 2, amp_dmrs, txl, amp, remaining_re);
} break;
}
} else
AssertFatal(false, "rel15->numDmrsCdmGrpsNoData is %d\n", rel15->numDmrsCdmGrpsNoData);
} else {
txl += dmrs_case00(output,
txl,
dmrs_start,
amp_dmrs,
amp,
sz,
start_sc,
remaining_re,
dmrs_port,
dmrs_Type,
symbol_sz,
l_prime,
rel15->numDmrsCdmGrpsNoData);
} // generic DMRS case
} else { // no PTRS or DMRS in this symbol
txl += no_ptrs_dmrs_case(output + start_sc, txl, amp, upper_limit);
txl += no_ptrs_dmrs_case(output, txl, amp, remaining_re);
} // no DMRS/PTRS in symbol
return txl - txl_start;
}
static inline void do_txdataF(c16_t **txdataF,
int symbol_sz,
c16_t txdataF_precoding[][symbol_sz],
PHY_VARS_gNB *gNB,
nfapi_nr_dl_tti_pdsch_pdu_rel15_t *rel15,
int ant,
int start_sc,
int txdataF_offset_per_symbol)
{
NR_DL_FRAME_PARMS *frame_parms = &gNB->frame_parms;
int rb = 0;
uint16_t subCarrier = start_sc;
nfapi_nr_tx_precoding_and_beamforming_t *pb = &rel15->precodingAndBeamforming;
while (rb < rel15->rbSize) {
// get pmi info
const int pmi = (pb->num_prgs > 0 && pb->prg_size > 0) ? (pb->prgs_list[(int)rb / pb->prg_size].pm_idx) : 0;
const int pmi2 = (rb < (rel15->rbSize - 1) && pb->prg_size > 0) ? (pb->prgs_list[(int)(rb + 1) / pb->prg_size].pm_idx) : -1;
// If pmi of next RB and pmi of current RB are the same, we do 2 RB in a row
// if pmi differs, or current rb is the end (rel15->rbSize - 1), than we do 1 RB in a row
const int rb_step = pmi == pmi2 ? 2 : 1;
const int re_cnt = NR_NB_SC_PER_RB * rb_step;
if (pmi == 0) { // unitary Precoding
if (subCarrier + re_cnt <= symbol_sz) { // RB does not cross DC
if (ant < rel15->nrOfLayers)
memcpy(&txdataF[ant][txdataF_offset_per_symbol + subCarrier],
&txdataF_precoding[ant][subCarrier],
re_cnt * sizeof(**txdataF));
else
memset(&txdataF[ant][txdataF_offset_per_symbol + subCarrier], 0, re_cnt * sizeof(**txdataF));
} else { // RB does cross DC
const int neg_length = symbol_sz - subCarrier;
const int pos_length = re_cnt - neg_length;
if (ant < rel15->nrOfLayers) {
memcpy(&txdataF[ant][txdataF_offset_per_symbol + subCarrier],
&txdataF_precoding[ant][subCarrier],
neg_length * sizeof(**txdataF));
memcpy(&txdataF[ant][txdataF_offset_per_symbol], &txdataF_precoding[ant], pos_length * sizeof(**txdataF));
} else {
memset(&txdataF[ant][txdataF_offset_per_symbol + subCarrier], 0, neg_length * sizeof(**txdataF));
memset(&txdataF[ant][txdataF_offset_per_symbol], 0, pos_length * sizeof(**txdataF));
}
}
subCarrier += re_cnt;
if (subCarrier >= symbol_sz) {
subCarrier -= symbol_sz;
}
} else { // non-unitary Precoding
AssertFatal(frame_parms->nb_antennas_tx > 1, "No precoding can be done with a single antenna port\n");
// get the precoding matrix weights:
nfapi_nr_pm_pdu_t *pmi_pdu = &gNB->gNB_config.pmi_list.pmi_pdu[pmi - 1]; // pmi 0 is identity matrix
AssertFatal(pmi == pmi_pdu->pm_idx, "PMI %d doesn't match to the one in precoding matrix %d\n", pmi, pmi_pdu->pm_idx);
AssertFatal(ant < pmi_pdu->num_ant_ports,
"Antenna port index %d exceeds precoding matrix AP size %d\n",
ant,
pmi_pdu->num_ant_ports);
AssertFatal(rel15->nrOfLayers == pmi_pdu->numLayers,
"Number of layers %d doesn't match to the one in precoding matrix %d\n",
rel15->nrOfLayers,
pmi_pdu->numLayers);
if ((subCarrier + re_cnt) < symbol_sz) { // within ofdm_symbol_size, use SIMDe
nr_layer_precoder_simd(rel15->nrOfLayers,
symbol_sz,
txdataF_precoding,
ant,
pmi_pdu,
subCarrier,
re_cnt,
&txdataF[ant][txdataF_offset_per_symbol]);
subCarrier += re_cnt;
} else { // crossing ofdm_symbol_size, use simple arithmetic operations
for (int i = 0; i < re_cnt; i++) {
txdataF[ant][txdataF_offset_per_symbol + subCarrier] =
nr_layer_precoder_cm(rel15->nrOfLayers, symbol_sz, txdataF_precoding, ant, pmi_pdu, subCarrier);
#ifdef DEBUG_DLSCH_MAPPING
printf("antenna %d\t l %d \t subCarrier %d \t txdataF: %d %d\n",
ant,
l_symbol,
subCarrier,
txdataF[ant][l_symbol * symbol_sz + subCarrier + txdataF_offset].r,
txdataF[ant][l_symbol * symbol_sz + subCarrier + txdataF_offset].i);
#endif
if (++subCarrier >= symbol_sz) {
subCarrier -= symbol_sz;
}
}
} // else{ // crossing ofdm_symbol_size, use simple arithmetic operations
} // else { // non-unitary Precoding
rb += rb_step;
} // RB loop: while(rb < rel15->rbSize)
}
static int do_one_dlsch(unsigned char *input_ptr, PHY_VARS_gNB *gNB, NR_gNB_DLSCH_t *dlsch, int slot)
{
const int16_t amp = gNB->TX_AMP;
NR_DL_FRAME_PARMS *frame_parms = &gNB->frame_parms;
time_stats_t *dlsch_scrambling_stats = &gNB->dlsch_scrambling_stats;
time_stats_t *dlsch_modulation_stats = &gNB->dlsch_modulation_stats;
NR_DL_gNB_HARQ_t *harq = &dlsch->harq_process;
nfapi_nr_dl_tti_pdsch_pdu_rel15_t *rel15 = &harq->pdsch_pdu.pdsch_pdu_rel15;
const int layerSz = frame_parms->N_RB_DL * NR_SYMBOLS_PER_SLOT * NR_NB_SC_PER_RB;
const int symbol_sz=frame_parms->ofdm_symbol_size;
const int dmrs_Type = rel15->dmrsConfigType;
const int nb_re_dmrs = rel15->numDmrsCdmGrpsNoData * (rel15->dmrsConfigType == NFAPI_NR_DMRS_TYPE1 ? 6 : 4);
const int amp_dmrs = (int)((double)amp * sqrt(rel15->numDmrsCdmGrpsNoData)); // 3GPP TS 38.214 Section 4.1: Table 4.1-1
LOG_D(PHY,
"pdsch: BWPStart %d, BWPSize %d, rbStart %d, rbsize %d\n",
rel15->BWPStart,
rel15->BWPSize,
rel15->rbStart,
rel15->rbSize);
const int n_dmrs = (rel15->BWPStart + rel15->rbStart + rel15->rbSize) * nb_re_dmrs;
const int dmrs_symbol_map = rel15->dlDmrsSymbPos; // single DMRS: 010000100 Double DMRS 110001100
const int xOverhead = 0;
const int nb_re =
(12 * rel15->NrOfSymbols - nb_re_dmrs * get_num_dmrs(rel15->dlDmrsSymbPos) - xOverhead) * rel15->rbSize * rel15->nrOfLayers;
const int Qm = rel15->qamModOrder[0];
const int encoded_length = nb_re * Qm;
/* PTRS */
uint16_t dlPtrsSymPos = 0;
int n_ptrs = 0;
uint32_t ptrsSymbPerSlot = 0;
if (rel15->pduBitmap & 0x1) {
set_ptrs_symb_idx(&dlPtrsSymPos,
rel15->NrOfSymbols,
rel15->StartSymbolIndex,
1 << rel15->PTRSTimeDensity,
rel15->dlDmrsSymbPos);
n_ptrs = (rel15->rbSize + rel15->PTRSFreqDensity - 1) / rel15->PTRSFreqDensity;
ptrsSymbPerSlot = get_ptrs_symbols_in_slot(dlPtrsSymPos, rel15->StartSymbolIndex, rel15->NrOfSymbols);
}
harq->unav_res = ptrsSymbPerSlot * n_ptrs;
#ifdef DEBUG_DLSCH
printf("PDSCH encoding:\nPayload:\n");
for (int i = 0; i < (harq->B >> 3); i += 16) {
for (int j = 0; j < 16; j++)
printf("0x%02x\t", harq->pdu[i + j]);
printf("\n");
}
printf("\nEncoded payload:\n");
for (int i = 0; i < encoded_length; i += 8) {
for (int j = 0; j < 8; j++)
printf("%d", input_ptr[i + j]);
printf("\t");
}
printf("\n");
#endif
if (IS_SOFTMODEM_DLSIM)
memcpy(harq->f, input_ptr, encoded_length);
c16_t mod_symbs[rel15->NrOfCodewords][encoded_length] __attribute__((aligned(64)));
for (int codeWord = 0; codeWord < rel15->NrOfCodewords; codeWord++) {
/// scrambling
start_meas(dlsch_scrambling_stats);
uint32_t scrambled_output[(encoded_length >> 5) + 4]; // modulator acces by 4 bytes in some cases
memset(scrambled_output, 0, sizeof(scrambled_output));
nr_pdsch_codeword_scrambling(input_ptr, encoded_length, codeWord, rel15->dataScramblingId, rel15->rnti, scrambled_output);
#ifdef DEBUG_DLSCH
printf("PDSCH scrambling:\n");
for (int i = 0; i < encoded_length >> 8; i++) {
for (int j = 0; j < 8; j++)
printf("0x%08x\t", scrambled_output[(i << 3) + j]);
printf("\n");
}
#endif
stop_meas(dlsch_scrambling_stats);
/// Modulation
start_meas(dlsch_modulation_stats);
nr_modulation(scrambled_output, encoded_length, Qm, (int16_t *)mod_symbs[codeWord]);
VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_gNB_PDSCH_MODULATION, 0);
stop_meas(dlsch_modulation_stats);
#ifdef DEBUG_DLSCH
printf("PDSCH Modulation: Qm %d(%d)\n", Qm, nb_re);
for (int i = 0; i < nb_re; i += 8) {
for (int j = 0; j < 8; j++) {
printf("%d %d\t", mod_symbs[codeWord][i + j].r, mod_symbs[codeWord][i + j].i);
}
printf("\n");
}
#endif
}
/// Resource mapping
// Non interleaved VRB to PRB mapping
uint16_t start_sc = frame_parms->first_carrier_offset + (rel15->rbStart + rel15->BWPStart) * NR_NB_SC_PER_RB;
if (start_sc >= symbol_sz)
start_sc -= symbol_sz;
const uint32_t txdataF_offset = slot * frame_parms->samples_per_slot_wCP;
#ifdef DEBUG_DLSCH_MAPPING
printf("PDSCH resource mapping started (start SC %d\tstart symbol %d\tN_PRB %d\tnb_re %d,nb_layers %d)\n",
start_sc,
rel15->StartSymbolIndex,
rel15->rbSize,
nb_re,
rel15->nrOfLayers);
#endif
AssertFatal(n_dmrs, "n_dmrs can't be 0\n");
// make a large enough tail to process all re with SIMD regardless a garbadge filler
c16_t mod_dmrs[(n_dmrs+63)&~63] __attribute__((aligned(64)));
unsigned int re_beginning_of_symbol = 0;
int layerSz2 = (layerSz + 63) & ~63;
c16_t tx_layers[rel15->nrOfLayers][layerSz2] __attribute__((aligned(64)));
memset(tx_layers, 0, sizeof(tx_layers));
nr_layer_mapping(rel15->NrOfCodewords, encoded_length, mod_symbs, rel15->nrOfLayers, layerSz2, nb_re, tx_layers);
/// Layer Precoding and Antenna port mapping
// tx_layers 1-8 are mapped on antenna ports 1000-1007
// The precoding info is supported by nfapi such as num_prgs, prg_size, prgs_list and pm_idx
// The same precoding matrix is applied on prg_size RBs, Thus
// pmi = prgs_list[rbidx/prg_size].pm_idx, rbidx =0,...,rbSize-1
// The Precoding matrix:
// The Codebook Type I
start_meas(&gNB->dlsch_precoding_stats);
nfapi_nr_tx_precoding_and_beamforming_t *pb = &rel15->precodingAndBeamforming;
// beam number in multi-beam scenario (concurrent beams)
int bitmap = SL_to_bitmap(rel15->StartSymbolIndex, rel15->NrOfSymbols);
int beam_nb = beam_index_allocation(pb->prgs_list[0].dig_bf_interface_list[0].beam_idx,
&gNB->common_vars,
slot,
frame_parms->symbols_per_slot,
bitmap);
c16_t **txdataF = gNB->common_vars.txdataF[beam_nb];
// Loop Over OFDM symbols:
for (int l_symbol = rel15->StartSymbolIndex; l_symbol < rel15->StartSymbolIndex + rel15->NrOfSymbols; l_symbol++) {
int l_prime = 0; // single symbol layer 0
int l_overline = get_l0(rel15->dlDmrsSymbPos);
#ifdef DEBUG_DLSCH_MAPPING
printf("PDSCH resource mapping symbol %d\n", l_symbol);
#endif
/// DMRS QPSK modulation
if ((dmrs_symbol_map & (1 << l_symbol))) { // DMRS time occasion
// The reference point for is subcarrier -1 of the lowest-numbered resource block in CORESET 0 if the corresponding
// PDCCH is associated with CORESET -1 and Type0-PDCCH common search space and is addressed to SI-RNTI
// 2GPP TS 38.211 V15.8.0 Section 7.4.1.1.2 Mapping to physical resources
if (l_symbol == (l_overline + 1)) // take into account the double DMRS symbols
l_prime = 1;
else if (l_symbol > (l_overline + 1)) { // new DMRS pair
l_overline = l_symbol;
l_prime = 0;
}
#ifdef DEBUG_DLSCH_MAPPING
printf("dlDmrsScramblingId %d, SCID %d slot %d l_symbol %d\n", rel15->dlDmrsScramblingId, rel15->SCID, slot, l_symbol);
#endif
const uint32_t *gold = nr_gold_pdsch(frame_parms->N_RB_DL,
frame_parms->symbols_per_slot,
rel15->dlDmrsScramblingId,
rel15->SCID,
slot,
l_symbol);
// Qm = 1 as DMRS is QPSK modulated
nr_modulation(gold, n_dmrs * DMRS_MOD_ORDER, DMRS_MOD_ORDER, (int16_t *)mod_dmrs);
#ifdef DEBUG_DLSCH_MAPPING
printf("DMRS modulation (symbol %d, %d symbols, type %d):\n", l_symbol, n_dmrs, dmrs_Type);
for (int i = 0; i < n_dmrs / 2; i += 8) {
for (int j = 0; j < 8; j++) {
printf("%d %d\t", mod_dmrs[i + j].r, mod_dmrs[i + j].i);
}
printf("\n");
}
#endif
}
uint32_t dmrs_idx = rel15->rbStart;
if (rel15->rnti != SI_RNTI)
dmrs_idx += rel15->BWPStart;
dmrs_idx *= dmrs_Type == NFAPI_NR_DMRS_TYPE1 ? 6 : 4;
c16_t txdataF_precoding[rel15->nrOfLayers][symbol_sz] __attribute__((aligned(64)));
int layer_sz = 0;
for (int layer = 0; layer < rel15->nrOfLayers; layer++) {
layer_sz = do_onelayer(frame_parms,
slot,
rel15,
layer,
txdataF_precoding[layer],
tx_layers[layer] + re_beginning_of_symbol,
start_sc,
symbol_sz,
l_symbol,
dlPtrsSymPos,
n_ptrs,
amp,
amp_dmrs,
l_prime,
dmrs_Type,
mod_dmrs + dmrs_idx);
} // layer loop
re_beginning_of_symbol += layer_sz;
stop_meas(&gNB->dlsch_resource_mapping_stats);
for (int ant = 0; ant < frame_parms->nb_antennas_tx; ant++) {
const size_t txdataF_offset_per_symbol = l_symbol * symbol_sz + txdataF_offset;
do_txdataF(txdataF, symbol_sz, txdataF_precoding, gNB, rel15, ant, start_sc, txdataF_offset_per_symbol);
}
}
stop_meas(&gNB->dlsch_precoding_stats);
/* output and its parts for each dlsch should be aligned on 64 bytes
* should remain a multiple of 64 with enough offset to fit each dlsch
*/
uint32_t size_output_tb = rel15->rbSize * NR_SYMBOLS_PER_SLOT * NR_NB_SC_PER_RB * Qm * rel15->nrOfLayers;
return ceil_mod(size_output_tb,64);
}
void nr_generate_pdsch(processingData_L1tx_t *msgTx, int frame, int slot)
{
PHY_VARS_gNB *gNB = msgTx->gNB;
const int16_t amp = gNB->TX_AMP;
NR_DL_FRAME_PARMS *frame_parms = &gNB->frame_parms;
time_stats_t *dlsch_encoding_stats=&gNB->dlsch_encoding_stats;
time_stats_t *dlsch_scrambling_stats=&gNB->dlsch_scrambling_stats;
time_stats_t *dlsch_modulation_stats=&gNB->dlsch_modulation_stats;
time_stats_t *tinput=&gNB->tinput;
time_stats_t *tprep=&gNB->tprep;
time_stats_t *tparity=&gNB->tparity;
time_stats_t *toutput=&gNB->toutput;
time_stats_t *dlsch_rate_matching_stats=&gNB->dlsch_rate_matching_stats;
time_stats_t *dlsch_interleaving_stats=&gNB->dlsch_interleaving_stats;
time_stats_t *dlsch_segmentation_stats=&gNB->dlsch_segmentation_stats;
time_stats_t *dlsch_encoding_stats = &gNB->dlsch_encoding_stats;
time_stats_t *tinput = &gNB->tinput;
time_stats_t *tprep = &gNB->tprep;
time_stats_t *tparity = &gNB->tparity;
time_stats_t *toutput = &gNB->toutput;
time_stats_t *dlsch_rate_matching_stats = &gNB->dlsch_rate_matching_stats;
time_stats_t *dlsch_interleaving_stats = &gNB->dlsch_interleaving_stats;
time_stats_t *dlsch_segmentation_stats = &gNB->dlsch_segmentation_stats;
size_t size_output = 0;
for (int dlsch_id=0; dlsch_id<msgTx->num_pdsch_slot; dlsch_id++) {
for (int dlsch_id = 0; dlsch_id < msgTx->num_pdsch_slot; dlsch_id++) {
NR_gNB_DLSCH_t *dlsch = msgTx->dlsch[dlsch_id];
NR_DL_gNB_HARQ_t *harq = &dlsch->harq_process;
nfapi_nr_dl_tti_pdsch_pdu_rel15_t *rel15 = &harq->pdsch_pdu.pdsch_pdu_rel15;
LOG_D(PHY,"pdsch: BWPStart %d, BWPSize %d, rbStart %d, rbsize %d\n",
rel15->BWPStart,rel15->BWPSize,rel15->rbStart,rel15->rbSize);
LOG_D(PHY,
"pdsch: BWPStart %d, BWPSize %d, rbStart %d, rbsize %d\n",
rel15->BWPStart,
rel15->BWPSize,
rel15->rbStart,
rel15->rbSize);
const int Qm = rel15->qamModOrder[0];
......@@ -81,7 +803,7 @@ void nr_generate_pdsch(processingData_L1tx_t *msgTx, int frame, int slot)
uint16_t dlPtrsSymPos = 0;
int n_ptrs = 0;
uint32_t ptrsSymbPerSlot = 0;
if(rel15->pduBitmap & 0x1) {
if (rel15->pduBitmap & 0x1) {
set_ptrs_symb_idx(&dlPtrsSymPos,
rel15->NrOfSymbols,
rel15->StartSymbolIndex,
......@@ -103,11 +825,8 @@ void nr_generate_pdsch(processingData_L1tx_t *msgTx, int frame, int slot)
}
unsigned char output[size_output] __attribute__((aligned(64)));
bzero(output, size_output);
size_t offset_output = 0;
start_meas(dlsch_encoding_stats);
if (nr_dlsch_encoding(gNB,
msgTx,
......@@ -121,519 +840,20 @@ void nr_generate_pdsch(processingData_L1tx_t *msgTx, int frame, int slot)
toutput,
dlsch_rate_matching_stats,
dlsch_interleaving_stats,
dlsch_segmentation_stats) == -1) {
dlsch_segmentation_stats)
== -1) {
return;
}
stop_meas(dlsch_encoding_stats);
for (int dlsch_id=0; dlsch_id<msgTx->num_pdsch_slot; dlsch_id++) {
NR_gNB_DLSCH_t *dlsch = msgTx->dlsch[dlsch_id];
NR_DL_gNB_HARQ_t *harq = &dlsch->harq_process;
nfapi_nr_dl_tti_pdsch_pdu_rel15_t *rel15 = &harq->pdsch_pdu.pdsch_pdu_rel15;
const int layerSz = frame_parms->N_RB_DL * NR_SYMBOLS_PER_SLOT * NR_NB_SC_PER_RB;
const int dmrs_Type = rel15->dmrsConfigType;
const int nb_re_dmrs = rel15->numDmrsCdmGrpsNoData * (rel15->dmrsConfigType == NFAPI_NR_DMRS_TYPE1 ? 6 : 4);
const int amp_dmrs = (int)((double)amp * sqrt(rel15->numDmrsCdmGrpsNoData)); // 3GPP TS 38.214 Section 4.1: Table 4.1-1
LOG_D(PHY,"pdsch: BWPStart %d, BWPSize %d, rbStart %d, rbsize %d\n",
rel15->BWPStart,rel15->BWPSize,rel15->rbStart,rel15->rbSize);
const int n_dmrs = (rel15->BWPStart + rel15->rbStart + rel15->rbSize) * nb_re_dmrs;
const int dmrs_symbol_map = rel15->dlDmrsSymbPos; // single DMRS: 010000100 Double DMRS 110001100
const int xOverhead = 0;
const int nb_re =
(12 * rel15->NrOfSymbols - nb_re_dmrs * get_num_dmrs(rel15->dlDmrsSymbPos) - xOverhead) * rel15->rbSize * rel15->nrOfLayers;
const int Qm = rel15->qamModOrder[0];
const int encoded_length = nb_re * Qm;
/* PTRS */
uint16_t dlPtrsSymPos = 0;
int n_ptrs = 0;
uint32_t ptrsSymbPerSlot = 0;
if(rel15->pduBitmap & 0x1) {
set_ptrs_symb_idx(&dlPtrsSymPos,
rel15->NrOfSymbols,
rel15->StartSymbolIndex,
1 << rel15->PTRSTimeDensity,
rel15->dlDmrsSymbPos);
n_ptrs = (rel15->rbSize + rel15->PTRSFreqDensity - 1) / rel15->PTRSFreqDensity;
ptrsSymbPerSlot = get_ptrs_symbols_in_slot(dlPtrsSymPos, rel15->StartSymbolIndex, rel15->NrOfSymbols);
}
harq->unav_res = ptrsSymbPerSlot * n_ptrs;
#ifdef DEBUG_DLSCH
printf("PDSCH encoding:\nPayload:\n");
for (int i = 0; i < (harq->B>>3); i += 16) {
for (int j=0; j < 16; j++)
printf("0x%02x\t", harq->pdu[i + j]);
printf("\n");
}
printf("\nEncoded payload:\n");
for (int i = 0; i < encoded_length; i += 8) {
for (int j = 0; j < 8; j++)
printf("%d", output[offset_output + i + j]);
printf("\t");
}
printf("\n");
#endif
if (IS_SOFTMODEM_DLSIM)
memcpy(harq->f, &output[offset_output], encoded_length);
c16_t mod_symbs[rel15->NrOfCodewords][encoded_length];
for (int codeWord = 0; codeWord < rel15->NrOfCodewords; codeWord++) {
/// scrambling
start_meas(dlsch_scrambling_stats);
uint32_t scrambled_output[(encoded_length>>5)+4]; // modulator acces by 4 bytes in some cases
memset(scrambled_output, 0, sizeof(scrambled_output));
if ( encoded_length > rel15->rbSize * NR_SYMBOLS_PER_SLOT * NR_NB_SC_PER_RB * Qm * rel15->nrOfLayers) abort();
nr_pdsch_codeword_scrambling(&output[offset_output], encoded_length, codeWord, rel15->dataScramblingId, rel15->rnti, scrambled_output);
#ifdef DEBUG_DLSCH
printf("PDSCH scrambling:\n");
for (int i=0; i<encoded_length>>8; i++) {
for (int j=0; j<8; j++)
printf("0x%08x\t", scrambled_output[(i<<3)+j]);
printf("\n");
}
#endif
stop_meas(dlsch_scrambling_stats);
/// Modulation
start_meas(dlsch_modulation_stats);
nr_modulation(scrambled_output, encoded_length, Qm, (int16_t *)mod_symbs[codeWord]);
VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_gNB_PDSCH_MODULATION, 0);
stop_meas(dlsch_modulation_stats);
#ifdef DEBUG_DLSCH
printf("PDSCH Modulation: Qm %d(%d)\n", Qm, nb_re);
for (int i = 0; i < nb_re; i += 8) {
for (int j=0; j<8; j++) {
printf("%d %d\t", mod_symbs[codeWord][i + j].r, mod_symbs[codeWord][i + j].i);
}
printf("\n");
}
#endif
}
/// Resource mapping
/* output and its parts for each dlsch should be aligned on 64 bytes
* => offset_output should remain a multiple of 64 with enough offset to fit each dlsch
*/
uint32_t size_output_tb = rel15->rbSize * NR_SYMBOLS_PER_SLOT * NR_NB_SC_PER_RB * Qm * rel15->nrOfLayers;
offset_output += ceil_mod(size_output_tb, 64);
// Non interleaved VRB to PRB mapping
uint16_t start_sc = frame_parms->first_carrier_offset + (rel15->rbStart+rel15->BWPStart)*NR_NB_SC_PER_RB;
if (start_sc >= frame_parms->ofdm_symbol_size)
start_sc -= frame_parms->ofdm_symbol_size;
const uint32_t txdataF_offset = slot * frame_parms->samples_per_slot_wCP;
c16_t txdataF_precoding[rel15->nrOfLayers][NR_NUMBER_OF_SYMBOLS_PER_SLOT][frame_parms->ofdm_symbol_size] __attribute__((aligned(64)));;
#ifdef DEBUG_DLSCH_MAPPING
printf("PDSCH resource mapping started (start SC %d\tstart symbol %d\tN_PRB %d\tnb_re %d,nb_layers %d)\n",
start_sc,
rel15->StartSymbolIndex,
rel15->rbSize,
nb_re,
rel15->nrOfLayers);
#endif
start_meas(&gNB->dlsch_resource_mapping_stats);
for (int layer = 0; layer < rel15->nrOfLayers; layer++) {
c16_t tx_layer[layerSz] __attribute__((aligned(64)));
nr_layer_mapping(rel15->NrOfCodewords, encoded_length, mod_symbs, rel15->nrOfLayers, layerSz, nb_re, tx_layer, layer);
int dmrs_port = get_dmrs_port(layer, rel15->dmrsPorts);
// DMRS params for this dmrs port
int Wt[2], Wf[2];
get_Wt(Wt, dmrs_port, dmrs_Type);
get_Wf(Wf, dmrs_port, dmrs_Type);
const int8_t delta = get_delta(dmrs_port, dmrs_Type);
int8_t l_prime = 0; // single symbol layer 0
int8_t l_overline = get_l0(rel15->dlDmrsSymbPos);
#ifdef DEBUG_DLSCH_MAPPING
uint8_t dmrs_symbol = l_overline + l_prime;
printf("DMRS Type %d params for layer %d: Wt %d %d \t Wf %d %d \t delta %d \t l_prime %d \t l0 %d\tDMRS symbol %d\n",
1 + dmrs_Type,
layer,
Wt[0],
Wt[1],
Wf[0],
Wf[1],
delta,
l_prime,
l_overline,
dmrs_symbol);
#endif
uint32_t cur_re = 0, dmrs_idx = 0;
AssertFatal(n_dmrs, "n_dmrs can't be 0\n");
c16_t mod_dmrs[n_dmrs] __attribute__((aligned(64)));
// Loop Over OFDM symbols:
for (int l_symbol = rel15->StartSymbolIndex; l_symbol < rel15->StartSymbolIndex + rel15->NrOfSymbols; l_symbol++) {
/// DMRS QPSK modulation
uint8_t k_prime = 0;
uint16_t n = 0;
if ((dmrs_symbol_map & (1 << l_symbol))) { // DMRS time occasion
// The reference point for is subcarrier 0 of the lowest-numbered resource block in CORESET 0 if the corresponding
// PDCCH is associated with CORESET 0 and Type0-PDCCH common search space and is addressed to SI-RNTI
// 3GPP TS 38.211 V15.8.0 Section 7.4.1.1.2 Mapping to physical resources
dmrs_idx = rel15->rbStart;
if (rel15->rnti != SI_RNTI)
dmrs_idx += rel15->BWPStart;
dmrs_idx *= dmrs_Type == NFAPI_NR_DMRS_TYPE1 ? 6 : 4;
if (l_symbol == (l_overline + 1)) // take into account the double DMRS symbols
l_prime = 1;
else if (l_symbol > (l_overline + 1)) { // new DMRS pair
l_overline = l_symbol;
l_prime = 0;
}
/// DMRS QPSK modulation
NR_DL_FRAME_PARMS *fp = &gNB->frame_parms;
const uint32_t *gold =
nr_gold_pdsch(fp->N_RB_DL, fp->symbols_per_slot, rel15->dlDmrsScramblingId, rel15->SCID, slot, l_symbol);
// Qm = 2 as DMRS is QPSK modulated
nr_modulation(gold, n_dmrs * DMRS_MOD_ORDER, DMRS_MOD_ORDER, (int16_t *)mod_dmrs);
#ifdef DEBUG_DLSCH
printf("DMRS modulation (symbol %d, %d symbols, type %d):\n", l_symbol, n_dmrs, dmrs_Type);
for (int i = 0; i < n_dmrs / 2; i += 8) {
for (int j = 0; j < 8; j++) {
printf("%d %d\t", mod_dmrs[i + j].r, mod_dmrs[i + j].i);
}
printf("\n");
}
#endif
}
/* calculate if current symbol is PTRS symbols */
int ptrs_idx = 0;
int ptrs_symbol = 0;
c16_t mod_ptrs[max(n_ptrs, 1)] __attribute__((aligned(64))); //max only to please sanitizer, that kills if 0 even if it is not a error
if(rel15->pduBitmap & 0x1) {
ptrs_symbol = is_ptrs_symbol(l_symbol, dlPtrsSymPos);
if(ptrs_symbol) {
/* PTRS QPSK Modulation for each OFDM symbol in a slot */
LOG_D(PHY, "Doing ptrs modulation for symbol %d, n_ptrs %d\n", l_symbol, n_ptrs);
NR_DL_FRAME_PARMS *fp = &gNB->frame_parms;
const uint32_t *gold =
nr_gold_pdsch(fp->N_RB_DL, fp->symbols_per_slot, rel15->dlDmrsScramblingId, rel15->SCID, slot, l_symbol);
nr_modulation(gold, n_ptrs * DMRS_MOD_ORDER, DMRS_MOD_ORDER, (int16_t *)mod_ptrs);
}
}
uint16_t k = start_sc;
if (ptrs_symbol || dmrs_symbol_map & (1 << l_symbol)) {
// Loop Over SCs:
for (int i=0; i<rel15->rbSize*NR_NB_SC_PER_RB; i++) {
/* check if cuurent RE is PTRS RE*/
uint8_t is_ptrs_re = 0;
/* check for PTRS symbol and set flag for PTRS RE */
if(ptrs_symbol){
is_ptrs_re = is_ptrs_subcarrier(k,
rel15->rnti,
rel15->PTRSFreqDensity,
rel15->rbSize,
rel15->PTRSReOffset,
start_sc,
frame_parms->ofdm_symbol_size);
}
/* Map DMRS Symbol */
if ((dmrs_symbol_map & (1 << l_symbol))
&& (k == ((start_sc + get_dmrs_freq_idx(n, k_prime, delta, dmrs_Type)) % (frame_parms->ofdm_symbol_size)))) {
txdataF_precoding[layer][l_symbol][k] = c16mulRealShift(mod_dmrs[dmrs_idx], Wt[l_prime] * Wf[k_prime] * amp_dmrs, 15);
#ifdef DEBUG_DLSCH_MAPPING
printf("dmrs_idx %u\t l %d \t k %d \t k_prime %d \t n %d \t txdataF: %d %d\n",
dmrs_idx,
l_symbol,
k,
k_prime,
n,
txdataF_precoding[layer][l_symbol][k].r,
txdataF_precoding[layer][l_symbol][k].i);
#endif
dmrs_idx++;
k_prime++;
k_prime&=1;
n+=(k_prime)?0:1;
}
/* Map PTRS Symbol */
else if (is_ptrs_re) {
uint16_t beta_ptrs = 1;
txdataF_precoding[layer][l_symbol][k] = c16mulRealShift(mod_ptrs[ptrs_idx], beta_ptrs * amp, 15);
#ifdef DEBUG_DLSCH_MAPPING
printf("ptrs_idx %d\t l %d \t k %d \t k_prime %d \t n %d \t txdataF: %d %d, mod_ptrs: %d %d\n",
ptrs_idx,
l_symbol,
k,
k_prime,
n,
txdataF_precoding[layer][l_symbol][k].r,
txdataF_precoding[layer][l_symbol][k].i,
mod_ptrs[ptrs_idx].r,
mod_ptrs[ptrs_idx].i);
#endif
ptrs_idx++;
}
/* Map DATA Symbol */
else if (ptrs_symbol
|| allowed_xlsch_re_in_dmrs_symbol(k,
start_sc,
frame_parms->ofdm_symbol_size,
rel15->numDmrsCdmGrpsNoData,
dmrs_Type)) {
txdataF_precoding[layer][l_symbol][k] = c16mulRealShift(tx_layer[cur_re], amp, 15);
#ifdef DEBUG_DLSCH_MAPPING
printf("re %u\t l %d \t k %d \t txdataF: %d %d\n",
cur_re,
l_symbol,
k,
txdataF_precoding[layer][l_symbol][k].r,
txdataF_precoding[layer][l_symbol][k].i);
#endif
cur_re++;
}
/* mute RE */
else {
txdataF_precoding[layer][l_symbol][k] = (c16_t){0};
}
if (++k >= frame_parms->ofdm_symbol_size)
k -= frame_parms->ofdm_symbol_size;
} //RE loop
} else { // no PTRS or DMRS in this symbol
// Loop Over SCs:
int upper_limit=rel15->rbSize*NR_NB_SC_PER_RB;
int remaining_re = 0;
if (start_sc + upper_limit > frame_parms->ofdm_symbol_size) {
remaining_re = upper_limit + start_sc - frame_parms->ofdm_symbol_size;
upper_limit = frame_parms->ofdm_symbol_size - start_sc;
}
// fix the alignment issues later, use 64-bit SIMD below instead of 128.
// can be made with loadu/storeu
if (0/*(frame_parms->N_RB_DL&1)==0*/) {
simde__m128i *txF = (simde__m128i *)&txdataF_precoding[layer][l_symbol][start_sc];
simde__m128i *txl = (simde__m128i *)&tx_layer[cur_re];
simde__m128i amp128=simde_mm_set1_epi16(amp);
for (int i=0; i<(upper_limit>>2); i++) {
txF[i] = simde_mm_mulhrs_epi16(amp128,txl[i]);
} //RE loop, first part
cur_re += upper_limit;
if (remaining_re > 0) {
txF = (simde__m128i *)&txdataF_precoding[layer][l_symbol];
txl = (simde__m128i *)&tx_layer[cur_re];
for (int i = 0; i < (remaining_re >> 2); i++) {
txF[i] = simde_mm_mulhrs_epi16(amp128, txl[i]);
}
}
}
else {
simde__m128i *txF = (simde__m128i *)&txdataF_precoding[layer][l_symbol][start_sc];
simde__m128i *txl = (simde__m128i *)&tx_layer[cur_re];
simde__m128i amp64 = simde_mm_set1_epi16(amp);
int i;
for (i = 0; i < (upper_limit >> 2); i++) {
const simde__m128i txL = simde_mm_loadu_si128(txl + i);
simde_mm_storeu_si128(txF + i, simde_mm_mulhrs_epi16(amp64, txL));
#ifdef DEBUG_DLSCH_MAPPING
for (int j = 0; j < 4; j++)
printf("re %u\t l %d \t k %d \t txdataF: %d %d\n",
cur_re + 4 * i + j,
l_symbol,
start_sc + 4 * i + j,
txdataF_precoding[layer][l_symbol][start_sc + 4 * i + j].r,
txdataF_precoding[layer][l_symbol][start_sc + 4 * i + j].i);
#endif
/* handle this, mute RE */
/*else {
txdataF_precoding[layer][((l*frame_parms->ofdm_symbol_size + k)<<1) ] = 0;
txdataF_precoding[layer][((l*frame_parms->ofdm_symbol_size + k)<<1) + 1] = 0;
}*/
}
if (i * 4 != upper_limit) {
c16_t *txFc = &txdataF_precoding[layer][l_symbol][start_sc];
c16_t *txlc = &tx_layer[cur_re];
for (i = (upper_limit >> 2) << 2; i < upper_limit; i++) {
txFc[i].r = (((txlc[i].r * amp) >> 14) + 1) >> 1;
txFc[i].i = (((txlc[i].i * amp) >> 14) + 1) >> 1;
#ifdef DEBUG_DLSCH_MAPPING
printf("re %u\t l %d \t k %d \t txdataF: %d %d\n", cur_re + i, l_symbol, start_sc + i, txFc[i].r, txFc[i].i);
#endif
}
}
cur_re += upper_limit;
if (remaining_re > 0) {
txF = (simde__m128i *)&txdataF_precoding[layer][l_symbol];
txl = (simde__m128i *)&tx_layer[cur_re];
int i;
for (i = 0; i < (remaining_re >> 2); i++) {
const simde__m128i txL = simde_mm_loadu_si128(txl + i);
simde_mm_storeu_si128(txF + i, simde_mm_mulhrs_epi16(amp64, txL));
#ifdef DEBUG_DLSCH_MAPPING
for (int j = 0; j < 4; j++)
printf("re %u\t l %d \t k %d \t txdataF: %d %d\n",
cur_re + 4 * i + j,
l_symbol,
4 * i + j,
txdataF_precoding[layer][l_symbol][4 * i + j].r,
txdataF_precoding[layer][l_symbol][4 * i + j].i);
#endif
/* handle this, mute RE */
/*else {
txdataF_precoding[layer][((l*frame_parms->ofdm_symbol_size + k)<<1) ] = 0;
txdataF_precoding[layer][((l*frame_parms->ofdm_symbol_size + k)<<1) + 1] = 0;
}*/
} // RE loop, second part
if (i * 4 != remaining_re) {
c16_t *txFc = txdataF_precoding[layer][l_symbol];
c16_t *txlc = &tx_layer[cur_re];
for (i = (remaining_re >> 2) << 2; i < remaining_re; i++) {
txFc[i].r = (((txlc[i].r * amp) >> 14) + 1) >> 1;
txFc[i].i = (((txlc[i].i * amp) >> 14) + 1) >> 1;
#ifdef DEBUG_DLSCH_MAPPING
printf("re %u\t l %d \t k %d \t txdataF: %d %d\n", cur_re + i, l_symbol, i, txFc[i].r, txFc[i].i);
#endif
}
}
} // remaining_re > 0
cur_re += remaining_re;
} // N_RB_DL even
} // no DMRS/PTRS in symbol
} // symbol loop
} // layer loop
stop_meas(&gNB->dlsch_resource_mapping_stats);
///Layer Precoding and Antenna port mapping
// tx_layers 1-8 are mapped on antenna ports 1000-1007
// The precoding info is supported by nfapi such as num_prgs, prg_size, prgs_list and pm_idx
// The same precoding matrix is applied on prg_size RBs, Thus
// pmi = prgs_list[rbidx/prg_size].pm_idx, rbidx =0,...,rbSize-1
// The Precoding matrix:
// The Codebook Type I
start_meas(&gNB->dlsch_precoding_stats);
nfapi_nr_tx_precoding_and_beamforming_t *pb = &rel15->precodingAndBeamforming;
// beam number in multi-beam scenario (concurrent beams)
int bitmap = SL_to_bitmap(rel15->StartSymbolIndex, rel15->NrOfSymbols);
int beam_nb = beam_index_allocation(pb->prgs_list[0].dig_bf_interface_list[0].beam_idx,
&gNB->common_vars,
slot,
frame_parms->symbols_per_slot,
bitmap);
c16_t **txdataF = gNB->common_vars.txdataF[beam_nb];
for (int ant = 0; ant < frame_parms->nb_antennas_tx; ant++) {
for (int l_symbol = rel15->StartSymbolIndex; l_symbol < rel15->StartSymbolIndex + rel15->NrOfSymbols; l_symbol++) {
uint16_t subCarrier = start_sc;
const size_t txdataF_offset_per_symbol = l_symbol * frame_parms->ofdm_symbol_size + txdataF_offset;
int rb = 0;
while(rb < rel15->rbSize) {
//get pmi info
const int pmi = (pb->num_prgs > 0 && pb->prg_size > 0) ? (pb->prgs_list[(int)rb / pb->prg_size].pm_idx) : 0;
const int pmi2 = (rb < (rel15->rbSize - 1) && pb->prg_size > 0) ? (pb->prgs_list[(int)(rb+1)/pb->prg_size].pm_idx) : -1;
// If pmi of next RB and pmi of current RB are the same, we do 2 RB in a row
// if pmi differs, or current rb is the end (rel15->rbSize - 1), than we do 1 RB in a row
const int rb_step = pmi == pmi2 ? 2 : 1;
const int re_cnt = NR_NB_SC_PER_RB * rb_step;
if (pmi == 0) {//unitary Precoding
if (subCarrier + re_cnt <= frame_parms->ofdm_symbol_size) { // RB does not cross DC
if (ant < rel15->nrOfLayers)
memcpy(&txdataF[ant][txdataF_offset_per_symbol + subCarrier],
&txdataF_precoding[ant][l_symbol][subCarrier],
re_cnt * sizeof(**txdataF));
else
memset(&txdataF[ant][txdataF_offset_per_symbol + subCarrier],
0,
re_cnt * sizeof(**txdataF));
} else { // RB does cross DC
const int neg_length = frame_parms->ofdm_symbol_size - subCarrier;
const int pos_length = re_cnt - neg_length;
if (ant < rel15->nrOfLayers) {
memcpy(&txdataF[ant][txdataF_offset_per_symbol + subCarrier],
&txdataF_precoding[ant][l_symbol][subCarrier],
neg_length * sizeof(**txdataF));
memcpy(&txdataF[ant][txdataF_offset_per_symbol],
&txdataF_precoding[ant][l_symbol],
pos_length * sizeof(**txdataF));
} else {
memset(&txdataF[ant][txdataF_offset_per_symbol + subCarrier],
0,
neg_length * sizeof(**txdataF));
memset(&txdataF[ant][txdataF_offset_per_symbol],
0,
pos_length * sizeof(**txdataF));
}
}
subCarrier += re_cnt;
if (subCarrier >= frame_parms->ofdm_symbol_size) {
subCarrier -= frame_parms->ofdm_symbol_size;
}
}
else { // non-unitary Precoding
AssertFatal(frame_parms->nb_antennas_tx > 1, "No precoding can be done with a single antenna port\n");
//get the precoding matrix weights:
nfapi_nr_pm_pdu_t *pmi_pdu = &gNB->gNB_config.pmi_list.pmi_pdu[pmi - 1]; // pmi 0 is identity matrix
AssertFatal(pmi == pmi_pdu->pm_idx, "PMI %d doesn't match to the one in precoding matrix %d\n",
pmi, pmi_pdu->pm_idx);
AssertFatal(ant < pmi_pdu->num_ant_ports, "Antenna port index %d exceeds precoding matrix AP size %d\n",
ant, pmi_pdu->num_ant_ports);
AssertFatal(rel15->nrOfLayers == pmi_pdu->numLayers, "Number of layers %d doesn't match to the one in precoding matrix %d\n",
rel15->nrOfLayers, pmi_pdu->numLayers);
if ((subCarrier + re_cnt) < frame_parms->ofdm_symbol_size) { // within ofdm_symbol_size, use SIMDe
nr_layer_precoder_simd(rel15->nrOfLayers,
NR_SYMBOLS_PER_SLOT,
frame_parms->ofdm_symbol_size,
txdataF_precoding,
ant,
pmi_pdu,
l_symbol,
subCarrier,
re_cnt,
&txdataF[ant][txdataF_offset_per_symbol]);
subCarrier += re_cnt;
} else { // crossing ofdm_symbol_size, use simple arithmetic operations
for (int i = 0; i < re_cnt; i++) {
txdataF[ant][txdataF_offset_per_symbol + subCarrier] =
nr_layer_precoder_cm(rel15->nrOfLayers,
NR_SYMBOLS_PER_SLOT,
frame_parms->ofdm_symbol_size,
txdataF_precoding,
ant,
pmi_pdu,
l_symbol,
subCarrier);
#ifdef DEBUG_DLSCH_MAPPING
printf("antenna %d\t l %d \t subCarrier %d \t txdataF: %d %d\n",
ant,
l_symbol,
subCarrier,
txdataF[ant][l_symbol * frame_parms->ofdm_symbol_size + subCarrier + txdataF_offset].r,
txdataF[ant][l_symbol * frame_parms->ofdm_symbol_size + subCarrier + txdataF_offset].i);
#endif
if (++subCarrier >= frame_parms->ofdm_symbol_size) {
subCarrier -= frame_parms->ofdm_symbol_size;
}
}
} // else{ // crossing ofdm_symbol_size, use simple arithmetic operations
} // else { // non-unitary Precoding
rb += rb_step;
} // RB loop: while(rb < rel15->rbSize)
} // symbol loop
} // port loop
stop_meas(&gNB->dlsch_precoding_stats);
}// dlsch loop
unsigned char *output_ptr = output;
for (int dlsch_id = 0; dlsch_id < msgTx->num_pdsch_slot; dlsch_id++) {
output_ptr += do_one_dlsch(output_ptr, gNB, msgTx->dlsch[dlsch_id], slot);
}
}
void dump_pdsch_stats(FILE *fd,PHY_VARS_gNB *gNB) {
void dump_pdsch_stats(FILE *fd, PHY_VARS_gNB *gNB)
{
for (int i = 0; i < MAX_MOBILES_PER_GNB; i++) {
NR_gNB_PHY_STATS_t *stats = &gNB->phy_stats[i];
if (stats->active && stats->frame != stats->dlsch_stats.dump_frame) {
......
......@@ -520,6 +520,8 @@ typedef struct PHY_VARS_gNB_s {
time_stats_t dlsch_interleaving_stats;
time_stats_t dlsch_segmentation_stats;
time_stats_t dci_generation_stats;
time_stats_t phase_comp_stats;
time_stats_t rx_pusch_stats;
time_stats_t rx_pusch_init_stats;
time_stats_t rx_pusch_symbol_processing_stats;
......
......@@ -290,6 +290,7 @@ void phy_procedures_gNB_TX(processingData_L1tx_t *msgTx,
//apply the OFDM symbol rotation here
if (gNB->phase_comp) {
start_meas(&gNB->phase_comp_stats);
for(int i = 0; i < gNB->common_vars.num_beams_period; ++i) {
for (int aa = 0; aa < cfg->carrier_config.num_tx_ant.value; aa++) {
apply_nr_rotation_TX(fp,
......@@ -304,6 +305,7 @@ void phy_procedures_gNB_TX(processingData_L1tx_t *msgTx,
T_INT(aa), T_BUFFER(&gNB->common_vars.txdataF[aa][txdataF_offset], fp->samples_per_slot_wCP*sizeof(int32_t)));
}
}
stop_meas(&gNB->phase_comp_stats);
}
VCD_SIGNAL_DUMPER_DUMP_FUNCTION_BY_NAME(VCD_SIGNAL_DUMPER_FUNCTIONS_PHY_PROCEDURES_gNB_TX + gNB->CC_id, 0);
......
......@@ -965,10 +965,12 @@ printf("%d\n", slot);
reset_meas(&gNB->dlsch_segmentation_stats);
reset_meas(&gNB->dlsch_modulation_stats);
reset_meas(&gNB->dlsch_encoding_stats);
reset_meas(&gNB->dci_generation_stats);
reset_meas(&gNB->tinput);
reset_meas(&gNB->tprep);
reset_meas(&gNB->tparity);
reset_meas(&gNB->toutput);
reset_meas(&gNB->phase_comp_stats);
uint32_t errors_scrambling[16] = {0};
int n_errors[16] = {0};
......@@ -1264,6 +1266,7 @@ printf("%d\n", slot);
UE->dl_harq_processes[0][slot].C,
msgDataTx->dlsch[0][0].harq_process.pdsch_pdu.pdsch_pdu_rel15.TBSize[0] << 3);
printDistribution(&gNB->phy_proc_tx,table_tx,"PHY proc tx");
printStatIndent2(&gNB->dci_generation_stats, "DCI encoding time");
printStatIndent2(&gNB->dlsch_encoding_stats,"DLSCH encoding time");
printStatIndent3(&gNB->dlsch_segmentation_stats,"DLSCH segmentation time");
printStatIndent3(&gNB->tinput,"DLSCH LDPC input processing time");
......@@ -1274,8 +1277,9 @@ printf("%d\n", slot);
printStatIndent3(&gNB->dlsch_interleaving_stats, "DLSCH Interleaving time");
printStatIndent2(&gNB->dlsch_modulation_stats,"DLSCH modulation time");
printStatIndent2(&gNB->dlsch_scrambling_stats, "DLSCH scrambling time");
printStatIndent2(&gNB->dlsch_resource_mapping_stats, "DLSCH Resource Mapping time");
printStatIndent2(&gNB->dlsch_precoding_stats,"DLSCH Layer Precoding time");
printStatIndent2(&gNB->dlsch_precoding_stats,"DLSCH Mapping/Precoding time");
if (gNB->phase_comp)
printStatIndent2(&gNB->phase_comp_stats, "Phase Compensation");
printf("\nUE function statistics (per %d us slot)\n", 1000 >> *scc->ssbSubcarrierSpacing);
for (int i = RX_PDSCH_STATS; i <= DLSCH_PROCEDURES_STATS; i++) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment