Commit ad712614 authored by Robert Schmidt's avatar Robert Schmidt

Merge remote-tracking branch 'origin/pdsch-precoding-opti' into integration_2023_w50

parents 34dd375a fcb23b44
......@@ -23,6 +23,8 @@
#include "PHY/NR_REFSIG/nr_mod_table.h"
#include "executables/softmodem-common.h"
// #define DEBUG_DLSCH_PRECODING_PRINT_WITH_TRIVIAL // TODO: For debug, to be removed if want to merge to develop
//Table 6.3.1.5-1 Precoding Matrix W 1 layer 2 antenna ports 'n' = -1 and 'o' = -j
const char nr_W_1l_2p[6][2][1] = {
{{'1'}, {'0'}}, // pmi 0
......@@ -711,3 +713,97 @@ c16_t nr_layer_precoder_cm(int n_layers,
return precodatatx_F;
}
void nr_layer_precoder_simd(const int n_layers,
const int n_symbols,
const int symSz,
const c16_t txdataF_res_mapped[n_layers][n_symbols][symSz],
const c16_t prec_matrix[n_layers],
const int symbol,
const int sc_offset,
const int re_cnt,
c16_t *txdataF_precoded)
{
uint32_t sc = sc_offset;
// For x86, use 256 SIMD for every 8 RE and 128 SIMD for last 4 RE
// For aarch64, use 128 SIMD for every 4 RE
// 256 SIMD: Do 8 RE in one iteration, 3 iterations for 2 RB
#ifdef __AVX2__
const uint32_t re_cnt_align8 = re_cnt & ~7;
for(; sc<sc_offset+(re_cnt_align8); sc+=sizeof(simde__m256i)/sizeof(*prec_matrix)){
// Matrix multiplication for 4 elements of the result (sizeof(simde__m256i) / sizeof(*prec_matrix) = 8)
simde__m256i y = simde_mm256_set1_epi16(0); // Y = W[0]*X[0] + W[1]*X[1] + ... + W[nrOfLayers-1]*X[nrOfLayers-1]
for(int nl=0; nl<n_layers; nl++){
const simde__m256i x = simde_mm256_loadu_epi32(&txdataF_res_mapped[nl][symbol][sc]);
// Rearrange precoding matrix weight to match complex multiplication and broadcast it to match SIMD size
const simde__m256i w_c = simde_mm256_set1_epi32(c16toI32(c16conj(prec_matrix[nl]))); // broadcast conjugate of w
const simde__m256i w_s = simde_mm256_set1_epi32(c16toI32(c16swap(prec_matrix[nl]))); // broadcast swapped real and img of w
// Multiplication and shift
const simde__m256i reals = simde_mm256_srai_epi32(simde_mm256_madd_epi16(x, w_c), 15); // (int32_t) .r = (x.r * w.r - x.i * w.i) >> 15
const simde__m256i imags = simde_mm256_slli_epi32(simde_mm256_madd_epi16(x, w_s), 1); // (int32_t) .i = (x.r * w.i + x.i * w.r) << 1, since higher 16 bit of each 32 bit is taken by blend_epi16
// Re-arrange to match c16_t format
const simde__m256i produ = simde_mm256_blend_epi16(reals, imags, 0xAA);
// Accumulate the product
y = simde_mm256_adds_epi16(y, produ);
}
// Store the result to txdataF
simde_mm256_storeu_si256(&txdataF_precoded[sc], y);
}
#endif
// 128 SIMD: Do 4 RE in one iteration, 3 iterations for 1 RB
const uint32_t re_cnt_align4 = re_cnt & ~3;
for(; sc<sc_offset+re_cnt_align4; sc+=sizeof(simde__m128i)/sizeof(*prec_matrix)){
#ifdef DEBUG_DLSCH_PRECODING_PRINT_WITH_TRIVIAL // Get result with trivial solution, TODO: To be removed
c16_t y_triv[4];
for(int i=0; i<4; i++)
y_triv[i] = nr_layer_precoder_cm(n_layers,
NR_SYMBOLS_PER_SLOT,
symSz,
txdataF_res_mapped,
prec_matrix,
symbol,
sc + i);
memcpy(&txdataF_precoded[sc], y_triv, sizeof(y_triv));
#endif
// Matrix multiplication for 4 elements of the result (sizeof(simde__m128i) / sizeof(c16_t) = 4)
simde__m128i y = simde_mm_set1_epi16(0); // Y = W[0]*X[0] + W[1]*X[1] + ... + W[nrOfLayers-1]*X[nrOfLayers-1]
for(int nl=0; nl<n_layers; nl++){
const simde__m128i x = simde_mm_loadu_epi32(&txdataF_res_mapped[nl][symbol][sc]);
// Rearrange precoding matrix weight to match complex multiplication and broadcast it to match SIMD size
const simde__m128i w_c = simde_mm_set1_epi32(c16toI32(c16conj(prec_matrix[nl]))); // broadcast conjugate of w
const simde__m128i w_s = simde_mm_set1_epi32(c16toI32(c16swap(prec_matrix[nl]))); // broadcast swapped real and img of w
// Multiplication and shift
const simde__m128i reals = simde_mm_srai_epi32(simde_mm_madd_epi16(x, w_c), 15); // (int32_t) .r = (x.r * w.r - x.i * w.i) >> 15
const simde__m128i imags = simde_mm_slli_epi32(simde_mm_madd_epi16(x, w_s), 1); // (int32_t) .i = (x.r * w.i + x.i * w.r) << 1, since higher 16 bit of each 32 bit is taken by blend_epi16
/* Re-arrange to match c16_t format
bit index: 0 | 16 | 32 | 48 | 64 | 80 | 96 | 112
reals = {R0.r[15..30] | R0.r[31] (0)*15 | R1.r[15..30] | R1.r[31] (0)*15 | R2.r[15..30] | R2.r[31] (0)*15 | R3.r[15..30] | R3.r[31] (0)*15}
imags = {0 R0.i[0..14]| R0.i[15..30] | 0 R1.i[0..14]| R1.i[15..30] | 0 R2.i[0..14]| R2.i[15..30] | 0 R3.i[0..14]| R3.i[15..30] }
16b from {reals | imags | reals | imags | reals | imags | reals | imags }
produ = {R0.r[15..30] | R0.i[15..30] | R1.r[15..30] | R1.i[15..30] | R2.r[15..30] | R2.i[15..30] | R3.r[15..30] | R3.i[15..30] }
*/
const simde__m128i produ = simde_mm_blend_epi16(reals, imags, 0xAA);
// Accumulate the product
y = simde_mm_adds_epi16(y, produ);
}
// Store the result to txdataF
simde_mm_storeu_si128(&txdataF_precoded[sc], y);
#ifdef DEBUG_DLSCH_PRECODING_PRINT_WITH_TRIVIAL // Print simd and trivial result, TODO: To be removed
c16_t *y_simd = (c16_t*) &y;
printf("debug_to_be_removed re_cnt=%d, sc=%d, y_simd=(%+4d,%+4d), (%+4d,%+4d), (%+4d,%+4d), (%+4d,%+4d)\n", re_cnt, sc, y_simd[0].r, y_simd[0].i, y_simd[1].r, y_simd[1].i, y_simd[2].r, y_simd[2].i, y_simd[3].r, y_simd[3].i);
printf("debug_to_be_removed re_cnt=%d, sc=%d, y_triv=(%+4d,%+4d), (%+4d,%+4d), (%+4d,%+4d), (%+4d,%+4d)\n", re_cnt, sc, y_triv[0].r, y_triv[0].i, y_triv[1].r, y_triv[1].i, y_triv[2].r, y_triv[2].i, y_triv[3].r, y_triv[3].i);
#endif
}
}
......@@ -145,4 +145,20 @@ c16_t nr_layer_precoder_cm(int n_layers,
c16_t *prec_matrix,
int symbol,
int offset);
/*! \brief Precoding with SIMDe, txdataF_precoded[] = prec_matrix[] * txdataF_res_mapped[]
@param[in] txdataF_res_mapped Tx data after resource mapping, before precoding.
@param[in] prec_matrix Weights of precoding matrix.
@param[in] re_cnt Number of RE (sub carrier) to write to txdataF_precoded, should be multiple of 4.
@param[out] txdataF_precoded Precoded antenna data
*/
void nr_layer_precoder_simd(const int n_layers,
const int n_symbols,
const int symSz,
const c16_t txdataF_res_mapped[n_layers][n_symbols][symSz],
const c16_t prec_matrix[n_layers],
const int symbol,
const int sc_offset,
const int re_cnt,
c16_t *txdataF_precoded);
#endif
......@@ -209,7 +209,7 @@ void nr_generate_pdsch(processingData_L1tx_t *msgTx, int frame, int slot)
if (start_sc >= frame_parms->ofdm_symbol_size)
start_sc -= frame_parms->ofdm_symbol_size;
const uint32_t txdataF_offset = slot*frame_parms->samples_per_slot_wCP;
const uint32_t txdataF_offset = slot * frame_parms->samples_per_slot_wCP;
c16_t txdataF_precoding[rel15->nrOfLayers][NR_NUMBER_OF_SYMBOLS_PER_SLOT][frame_parms->ofdm_symbol_size] __attribute__((aligned(64)));;
#ifdef DEBUG_DLSCH_MAPPING
......@@ -486,35 +486,40 @@ void nr_generate_pdsch(processingData_L1tx_t *msgTx, int frame, int slot)
for (int ant = 0; ant < frame_parms->nb_antennas_tx; ant++) {
for (int l_symbol = rel15->StartSymbolIndex; l_symbol < rel15->StartSymbolIndex + rel15->NrOfSymbols; l_symbol++) {
uint16_t subCarrier = start_sc;
for (int rb=0; rb<rel15->rbSize; rb++) {
const size_t txdataF_offset_per_symbol = l_symbol * frame_parms->ofdm_symbol_size + txdataF_offset;
const size_t txdataF_offset_per_symbol = l_symbol * frame_parms->ofdm_symbol_size + txdataF_offset;
int rb = 0;
while(rb < rel15->rbSize) {
//get pmi info
uint8_t pmi;
if (0 /*rel15->precodingAndBeamforming.prg_size > 0*/)
pmi = rel15->precodingAndBeamforming.prgs_list[(int)rb/rel15->precodingAndBeamforming.prg_size].pm_idx;
else
pmi = 0;//no precoding
const int pmi = (rel15->precodingAndBeamforming.prg_size > 0) ?
(rel15->precodingAndBeamforming.prgs_list[(int)rb/rel15->precodingAndBeamforming.prg_size].pm_idx) : 0;
const int pmi2 = (rb < (rel15->rbSize - 1) && rel15->precodingAndBeamforming.prg_size > 0) ?
(rel15->precodingAndBeamforming.prgs_list[(int)(rb+1)/rel15->precodingAndBeamforming.prg_size].pm_idx) : -1;
// If pmi of next RB and pmi of current RB are the same, we do 2 RB in a row
// if pmi differs, or current rb is the end (rel15->rbSize - 1), than we do 1 RB in a row
const int rb_step = pmi == pmi2 ? 2 : 1;
const int re_cnt = NR_NB_SC_PER_RB * rb_step;
if (pmi == 0) {//unitary Precoding
if (subCarrier + NR_NB_SC_PER_RB <= frame_parms->ofdm_symbol_size) { // RB does not cross DC
if (subCarrier + re_cnt <= frame_parms->ofdm_symbol_size) { // RB does not cross DC
if (ant < rel15->nrOfLayers)
memcpy(&txdataF[ant][txdataF_offset_per_symbol + subCarrier],
&txdataF_precoding[ant][l_symbol][subCarrier],
NR_NB_SC_PER_RB * sizeof(**txdataF));
memcpy(&txdataF[ant][txdataF_offset_per_symbol + subCarrier],
&txdataF_precoding[ant][l_symbol][subCarrier],
re_cnt * sizeof(**txdataF));
else
memset(&txdataF[ant][txdataF_offset_per_symbol + subCarrier],
0,
NR_NB_SC_PER_RB * sizeof(**txdataF));
re_cnt * sizeof(**txdataF));
} else { // RB does cross DC
int neg_length = frame_parms->ofdm_symbol_size - subCarrier;
int pos_length = NR_NB_SC_PER_RB - neg_length;
const int neg_length = frame_parms->ofdm_symbol_size - subCarrier;
const int pos_length = re_cnt - neg_length;
if (ant < rel15->nrOfLayers) {
memcpy(&txdataF[ant][txdataF_offset_per_symbol + subCarrier],
&txdataF_precoding[ant][l_symbol][subCarrier],
neg_length * sizeof(**txdataF));
memcpy(&txdataF[ant][txdataF_offset_per_symbol], &txdataF_precoding[ant][l_symbol], pos_length * sizeof(**txdataF));
memcpy(&txdataF[ant][txdataF_offset_per_symbol],
&txdataF_precoding[ant][l_symbol],
pos_length * sizeof(**txdataF));
} else {
memset(&txdataF[ant][txdataF_offset_per_symbol + subCarrier],
0,
......@@ -524,51 +529,68 @@ void nr_generate_pdsch(processingData_L1tx_t *msgTx, int frame, int slot)
pos_length * sizeof(**txdataF));
}
}
subCarrier += NR_NB_SC_PER_RB;
subCarrier += re_cnt;
if (subCarrier >= frame_parms->ofdm_symbol_size) {
subCarrier -= frame_parms->ofdm_symbol_size;
}
}
else {
if(frame_parms->nb_antennas_tx==1){//no precoding matrix defined
else { // non-unitary Precoding
if(frame_parms->nb_antennas_tx == 1){ // no precoding matrix defined
memcpy(&txdataF[ant][txdataF_offset_per_symbol + subCarrier],
&txdataF_precoding[ant][l_symbol][subCarrier],
NR_NB_SC_PER_RB * sizeof(**txdataF));
subCarrier += NR_NB_SC_PER_RB;
re_cnt * sizeof(**txdataF));
subCarrier += re_cnt;
if (subCarrier >= frame_parms->ofdm_symbol_size) {
subCarrier -= frame_parms->ofdm_symbol_size;
}
}
else {
else { // precoding with more than 1 tx
//get the precoding matrix weights:
c16_t **mat = (c16_t**)gNB->nr_mimo_precoding_matrix[rel15->nrOfLayers - 1];
//i_row =0,...,dl_antenna_port
//j_col =0,...,nrOfLayers
//mat[pmi][i_rows*2+j_col]
c16_t *W_prec = &mat[pmi][ant * rel15->nrOfLayers];
for (int i=0; i<NR_NB_SC_PER_RB; i++) {
txdataF[ant][txdataF_offset_per_symbol + subCarrier] = nr_layer_precoder_cm(rel15->nrOfLayers,
NR_SYMBOLS_PER_SLOT,
frame_parms->ofdm_symbol_size,
txdataF_precoding,
W_prec,
l_symbol,
subCarrier);
if((subCarrier + re_cnt) < frame_parms->ofdm_symbol_size){ // within ofdm_symbol_size, use SIMDe
nr_layer_precoder_simd(rel15->nrOfLayers,
NR_SYMBOLS_PER_SLOT,
frame_parms->ofdm_symbol_size,
txdataF_precoding,
W_prec,
l_symbol,
subCarrier,
re_cnt,
&txdataF[ant][txdataF_offset_per_symbol]);
subCarrier += re_cnt;
}
else{ // crossing ofdm_symbol_size, use simple arithmetic operations
for (int i = 0; i < re_cnt; i++) {
txdataF[ant][txdataF_offset_per_symbol + subCarrier] =
nr_layer_precoder_cm(rel15->nrOfLayers,
NR_SYMBOLS_PER_SLOT,
frame_parms->ofdm_symbol_size,
txdataF_precoding,
W_prec,
l_symbol,
subCarrier);
#ifdef DEBUG_DLSCH_MAPPING
printf("antenna %d\t l %d \t subCarrier %d \t txdataF: %d %d\n",
ant,
l_symbol,
subCarrier,
txdataF[ant][l_symbol * frame_parms->ofdm_symbol_size + subCarrier + txdataF_offset].r,
txdataF[ant][l_symbol * frame_parms->ofdm_symbol_size + subCarrier + txdataF_offset].i);
printf("antenna %d\t l %d \t subCarrier %d \t txdataF: %d %d\n",
ant,
symbol,
subCarrier,
txdataF[ant][l_symbol * frame_parms->ofdm_symbol_size + subCarrier + txdataF_offset].r,
txdataF[ant][l_symbol * frame_parms->ofdm_symbol_size + subCarrier + txdataF_offset].i);
#endif
if (++subCarrier >= frame_parms->ofdm_symbol_size) {
subCarrier -= frame_parms->ofdm_symbol_size;
if (++subCarrier >= frame_parms->ofdm_symbol_size) {
subCarrier -= frame_parms->ofdm_symbol_size;
}
}
}
}
}
} //RB loop
} // else{ // crossing ofdm_symbol_size, use simple arithmetic operations
} // else { // precoding with more than 1 tx
} // else { // non-unitary Precoding
rb += rb_step;
} // RB loop: while(rb < rel15->rbSize)
} // symbol loop
} // port loop
......
......@@ -161,6 +161,24 @@ extern "C" {
#define squaredMod(a) ((a).r*(a).r + (a).i*(a).i)
#define csum(res, i1, i2) (res).r = (i1).r + (i2).r ; (res).i = (i1).i + (i2).i
__attribute__((always_inline)) inline c16_t c16conj(const c16_t a) {
return (c16_t) {
.r = a.r,
.i = (int16_t)-a.i
};
}
__attribute__((always_inline)) inline uint32_t c16toI32(const c16_t a) {
return *((uint32_t*)&a);
}
__attribute__((always_inline)) inline c16_t c16swap(const c16_t a) {
return (c16_t){
.r = a.i,
.i = a.r
};
}
__attribute__((always_inline)) inline uint32_t c16amp2(const c16_t a) {
return a.r * a.r + a.i * a.i;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment