Commit f5dccc2a authored by rmagueta's avatar rmagueta

Improvements in nr_dlsch_channel_compensation() function

parent af498835
...@@ -738,6 +738,20 @@ void nr_dlsch_deinterleaving(uint8_t symbol, ...@@ -738,6 +738,20 @@ void nr_dlsch_deinterleaving(uint8_t symbol,
// Pre-processing for LLR computation // Pre-processing for LLR computation
//============================================================================================== //==============================================================================================
simde__m128i nr_dlsch_a_mult_conjb(simde__m128i a, simde__m128i b, unsigned char output_shift)
{
simde__m128i mmtmpD0 = simde_mm_madd_epi16(b, a);
simde__m128i mmtmpD1 = simde_mm_shufflelo_epi16(b, SIMDE_MM_SHUFFLE(2, 3, 0, 1));
mmtmpD1 = simde_mm_shufflehi_epi16(mmtmpD1, SIMDE_MM_SHUFFLE(2, 3, 0, 1));
mmtmpD1 = simde_mm_sign_epi16(mmtmpD1, *(simde__m128i *)&conjugate[0]);
mmtmpD1 = simde_mm_madd_epi16(mmtmpD1, a);
mmtmpD0 = simde_mm_srai_epi32(mmtmpD0, output_shift);
mmtmpD1 = simde_mm_srai_epi32(mmtmpD1, output_shift);
simde__m128i mmtmpD2 = simde_mm_unpacklo_epi32(mmtmpD0, mmtmpD1);
simde__m128i mmtmpD3 = simde_mm_unpackhi_epi32(mmtmpD0, mmtmpD1);
return simde_mm_packs_epi32(mmtmpD2, mmtmpD3);
}
static void nr_dlsch_channel_compensation(uint32_t rx_size_symbol, static void nr_dlsch_channel_compensation(uint32_t rx_size_symbol,
int nbRx, int nbRx,
c16_t rxdataF_ext[][rx_size_symbol], c16_t rxdataF_ext[][rx_size_symbol],
...@@ -757,21 +771,18 @@ static void nr_dlsch_channel_compensation(uint32_t rx_size_symbol, ...@@ -757,21 +771,18 @@ static void nr_dlsch_channel_compensation(uint32_t rx_size_symbol,
unsigned char output_shift, unsigned char output_shift,
PHY_NR_MEASUREMENTS *measurements) PHY_NR_MEASUREMENTS *measurements)
{ {
simde__m128i *dl_ch128, *dl_ch128_2, *dl_ch_mag128, *dl_ch_mag128b, *dl_ch_mag128r, *rxdataF128, *rxdataF_comp128, *rho128;
simde__m128i mmtmpD0, mmtmpD1, mmtmpD2, mmtmpD3, QAM_amp128 = {0}, QAM_amp128b = {0}, QAM_amp128r = {0};
unsigned short rb;
unsigned char aarx,atx;
simde__m128i *dl_ch128,*dl_ch128_2,*dl_ch_mag128,*dl_ch_mag128b,*dl_ch_mag128r,*rxdataF128,*rxdataF_comp128,*rho128;
simde__m128i mmtmpD0,mmtmpD1,mmtmpD2,mmtmpD3,QAM_amp128={0},QAM_amp128b={0},QAM_amp128r={0};
uint32_t nb_rb_0 = length / 12 + ((length % 12) ? 1 : 0); uint32_t nb_rb_0 = length / 12 + ((length % 12) ? 1 : 0);
for (int l = 0; l < n_layers; l++) { for (int l = 0; l < n_layers; l++) {
if (mod_order == 4) { if (mod_order == 4) {
QAM_amp128 = simde_mm_set1_epi16(QAM16_n1); // 2/sqrt(10) QAM_amp128 = simde_mm_set1_epi16(QAM16_n1); // 2/sqrt(10)
QAM_amp128b = simde_mm_setzero_si128(); QAM_amp128b = simde_mm_setzero_si128();
QAM_amp128r = simde_mm_setzero_si128(); QAM_amp128r = simde_mm_setzero_si128();
} else if (mod_order == 6) { } else if (mod_order == 6) {
QAM_amp128 = simde_mm_set1_epi16(QAM64_n1); // QAM_amp128 = simde_mm_set1_epi16(QAM64_n1); //
QAM_amp128b = simde_mm_set1_epi16(QAM64_n2); QAM_amp128b = simde_mm_set1_epi16(QAM64_n2);
QAM_amp128r = simde_mm_setzero_si128(); QAM_amp128r = simde_mm_setzero_si128();
} else if (mod_order == 8) { } else if (mod_order == 8) {
...@@ -782,257 +793,173 @@ static void nr_dlsch_channel_compensation(uint32_t rx_size_symbol, ...@@ -782,257 +793,173 @@ static void nr_dlsch_channel_compensation(uint32_t rx_size_symbol,
// printf("comp: rxdataF_comp %p, symbol %d\n",rxdataF_comp[0],symbol); // printf("comp: rxdataF_comp %p, symbol %d\n",rxdataF_comp[0],symbol);
for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { for (int aarx = 0; aarx < frame_parms->nb_antennas_rx; aarx++) {
dl_ch128 = (simde__m128i *)dl_ch_estimates_ext[(l * frame_parms->nb_antennas_rx) + aarx];
dl_ch128 = (simde__m128i *)dl_ch_estimates_ext[(l * frame_parms->nb_antennas_rx) + aarx];
dl_ch_mag128 = (simde__m128i *)dl_ch_mag[l][aarx]; dl_ch_mag128 = (simde__m128i *)dl_ch_mag[l][aarx];
dl_ch_mag128b = (simde__m128i *)dl_ch_magb[l][aarx]; dl_ch_mag128b = (simde__m128i *)dl_ch_magb[l][aarx];
dl_ch_mag128r = (simde__m128i *)dl_ch_magr[l][aarx]; dl_ch_mag128r = (simde__m128i *)dl_ch_magr[l][aarx];
rxdataF128 = (simde__m128i *)rxdataF_ext[aarx]; rxdataF128 = (simde__m128i *)rxdataF_ext[aarx];
rxdataF_comp128 = (simde__m128i *)(rxdataF_comp[l][aarx] + symbol * nb_rb * 12); rxdataF_comp128 = (simde__m128i *)(rxdataF_comp[l][aarx] + symbol * nb_rb * 12);
for (rb=0; rb<nb_rb_0; rb++) { for (int rb = 0; rb < nb_rb_0; rb++) {
if (mod_order>2) { if (mod_order > 2) {
// get channel amplitude if not QPSK // get channel amplitude if not QPSK
mmtmpD0 = simde_mm_madd_epi16(dl_ch128[0],dl_ch128[0]); mmtmpD0 = simde_mm_madd_epi16(dl_ch128[0], dl_ch128[0]);
mmtmpD0 = simde_mm_srai_epi32(mmtmpD0,output_shift); mmtmpD0 = simde_mm_srai_epi32(mmtmpD0, output_shift);
mmtmpD1 = simde_mm_madd_epi16(dl_ch128[1],dl_ch128[1]);
mmtmpD1 = simde_mm_srai_epi32(mmtmpD1,output_shift); mmtmpD1 = simde_mm_madd_epi16(dl_ch128[1], dl_ch128[1]);
mmtmpD0 = simde_mm_packs_epi32(mmtmpD0,mmtmpD1); //|H[0]|^2 |H[1]|^2 |H[2]|^2 |H[3]|^2 |H[4]|^2 |H[5]|^2 |H[6]|^2 |H[7]|^2 mmtmpD1 = simde_mm_srai_epi32(mmtmpD1, output_shift);
mmtmpD0 = simde_mm_packs_epi32(mmtmpD0, mmtmpD1); //|H[0]|^2 |H[1]|^2 |H[2]|^2 |H[3]|^2 |H[4]|^2 |H[5]|^2 |H[6]|^2 |H[7]|^2
// store channel magnitude here in a new field of dlsch // store channel magnitude here in a new field of dlsch
dl_ch_mag128[0] = simde_mm_unpacklo_epi16(mmtmpD0,mmtmpD0); dl_ch_mag128[0] = simde_mm_unpacklo_epi16(mmtmpD0, mmtmpD0);
dl_ch_mag128b[0] = dl_ch_mag128[0]; dl_ch_mag128b[0] = dl_ch_mag128[0];
dl_ch_mag128r[0] = dl_ch_mag128[0]; dl_ch_mag128r[0] = dl_ch_mag128[0];
dl_ch_mag128[0] = simde_mm_mulhi_epi16(dl_ch_mag128[0],QAM_amp128); dl_ch_mag128[0] = simde_mm_mulhrs_epi16(dl_ch_mag128[0], QAM_amp128);
dl_ch_mag128[0] = simde_mm_slli_epi16(dl_ch_mag128[0],1); dl_ch_mag128b[0] = simde_mm_mulhrs_epi16(dl_ch_mag128b[0], QAM_amp128b);
dl_ch_mag128r[0] = simde_mm_mulhrs_epi16(dl_ch_mag128r[0], QAM_amp128r);
dl_ch_mag128b[0] = simde_mm_mulhi_epi16(dl_ch_mag128b[0],QAM_amp128b);
dl_ch_mag128b[0] = simde_mm_slli_epi16(dl_ch_mag128b[0],1);
dl_ch_mag128r[0] = simde_mm_mulhi_epi16(dl_ch_mag128r[0],QAM_amp128r); dl_ch_mag128[1] = simde_mm_unpackhi_epi16(mmtmpD0, mmtmpD0);
dl_ch_mag128r[0] = simde_mm_slli_epi16(dl_ch_mag128r[0],1);
//print_ints("Re(ch):",(int16_t*)&mmtmpD0);
//print_shorts("QAM_amp:",(int16_t*)&QAM_amp128);
//print_shorts("mag:",(int16_t*)&dl_ch_mag128[0]);
dl_ch_mag128[1] = simde_mm_unpackhi_epi16(mmtmpD0,mmtmpD0);
dl_ch_mag128b[1] = dl_ch_mag128[1]; dl_ch_mag128b[1] = dl_ch_mag128[1];
dl_ch_mag128r[1] = dl_ch_mag128[1]; dl_ch_mag128r[1] = dl_ch_mag128[1];
dl_ch_mag128[1] = simde_mm_mulhi_epi16(dl_ch_mag128[1],QAM_amp128); dl_ch_mag128[1] = simde_mm_mulhrs_epi16(dl_ch_mag128[1], QAM_amp128);
dl_ch_mag128[1] = simde_mm_slli_epi16(dl_ch_mag128[1],1); dl_ch_mag128b[1] = simde_mm_mulhrs_epi16(dl_ch_mag128b[1], QAM_amp128b);
dl_ch_mag128r[1] = simde_mm_mulhrs_epi16(dl_ch_mag128r[1], QAM_amp128r);
dl_ch_mag128b[1] = simde_mm_mulhi_epi16(dl_ch_mag128b[1],QAM_amp128b);
dl_ch_mag128b[1] = simde_mm_slli_epi16(dl_ch_mag128b[1],1);
dl_ch_mag128r[1] = simde_mm_mulhi_epi16(dl_ch_mag128r[1],QAM_amp128r); mmtmpD0 = simde_mm_madd_epi16(dl_ch128[2], dl_ch128[2]);
dl_ch_mag128r[1] = simde_mm_slli_epi16(dl_ch_mag128r[1],1); mmtmpD0 = simde_mm_srai_epi32(mmtmpD0, output_shift);
mmtmpD1 = simde_mm_packs_epi32(mmtmpD0, mmtmpD0);
mmtmpD0 = simde_mm_madd_epi16(dl_ch128[2],dl_ch128[2]);//[H_I(0)^2+H_Q(0)^2 H_I(1)^2+H_Q(1)^2 H_I(2)^2+H_Q(2)^2 H_I(3)^2+H_Q(3)^2] dl_ch_mag128[2] = simde_mm_unpacklo_epi16(mmtmpD1, mmtmpD1);
mmtmpD0 = simde_mm_srai_epi32(mmtmpD0,output_shift);
mmtmpD1 = simde_mm_packs_epi32(mmtmpD0,mmtmpD0);//[|H(0)|^2 |H(1)|^2 |H(2)|^2 |H(3)|^2 |H(0)|^2 |H(1)|^2 |H(2)|^2 |H(3)|^2]
dl_ch_mag128[2] = simde_mm_unpacklo_epi16(mmtmpD1,mmtmpD1);//[|H(0)|^2 |H(0)|^2 |H(1)|^2 |H(1)|^2 |H(2)|^2 |H(2)|^2 |H(3)|^2 |H(3)|^2]
dl_ch_mag128b[2] = dl_ch_mag128[2]; dl_ch_mag128b[2] = dl_ch_mag128[2];
dl_ch_mag128r[2] = dl_ch_mag128[2]; dl_ch_mag128r[2] = dl_ch_mag128[2];
dl_ch_mag128[2] = simde_mm_mulhi_epi16(dl_ch_mag128[2],QAM_amp128); dl_ch_mag128[2] = simde_mm_mulhrs_epi16(dl_ch_mag128[2], QAM_amp128);
dl_ch_mag128[2] = simde_mm_slli_epi16(dl_ch_mag128[2],1); dl_ch_mag128b[2] = simde_mm_mulhrs_epi16(dl_ch_mag128b[2], QAM_amp128b);
dl_ch_mag128r[2] = simde_mm_mulhrs_epi16(dl_ch_mag128r[2], QAM_amp128r);
dl_ch_mag128b[2] = simde_mm_mulhi_epi16(dl_ch_mag128b[2],QAM_amp128b);
dl_ch_mag128b[2] = simde_mm_slli_epi16(dl_ch_mag128b[2],1);
dl_ch_mag128r[2] = simde_mm_mulhi_epi16(dl_ch_mag128r[2],QAM_amp128r);
dl_ch_mag128r[2] = simde_mm_slli_epi16(dl_ch_mag128r[2],1);
} }
// multiply by conjugated channel // Multiply received data by conjugated channel
mmtmpD0 = simde_mm_madd_epi16(dl_ch128[0],rxdataF128[0]); rxdataF_comp128[0] = nr_dlsch_a_mult_conjb(rxdataF128[0], dl_ch128[0], output_shift);
// print_ints("re",&mmtmpD0); rxdataF_comp128[1] = nr_dlsch_a_mult_conjb(rxdataF128[1], dl_ch128[1], output_shift);
rxdataF_comp128[2] = nr_dlsch_a_mult_conjb(rxdataF128[2], dl_ch128[2], output_shift);
// mmtmpD0 contains real part of 4 consecutive outputs (32-bit)
mmtmpD1 = simde_mm_shufflelo_epi16(dl_ch128[0],SIMDE_MM_SHUFFLE(2,3,0,1)); dl_ch128 += 3;
mmtmpD1 = simde_mm_shufflehi_epi16(mmtmpD1,SIMDE_MM_SHUFFLE(2,3,0,1)); dl_ch_mag128 += 3;
mmtmpD1 = simde_mm_sign_epi16(mmtmpD1,*(simde__m128i*)&conjugate[0]); dl_ch_mag128b += 3;
// print_ints("im",&mmtmpD1); dl_ch_mag128r += 3;
mmtmpD1 = simde_mm_madd_epi16(mmtmpD1,rxdataF128[0]); rxdataF128 += 3;
// mmtmpD1 contains imag part of 4 consecutive outputs (32-bit) rxdataF_comp128 += 3;
mmtmpD0 = simde_mm_srai_epi32(mmtmpD0,output_shift);
// print_ints("re(shift)",&mmtmpD0);
mmtmpD1 = simde_mm_srai_epi32(mmtmpD1,output_shift);
// print_ints("im(shift)",&mmtmpD1);
mmtmpD2 = simde_mm_unpacklo_epi32(mmtmpD0,mmtmpD1);
mmtmpD3 = simde_mm_unpackhi_epi32(mmtmpD0,mmtmpD1);
// print_ints("c0",&mmtmpD2);
// print_ints("c1",&mmtmpD3);
rxdataF_comp128[0] = simde_mm_packs_epi32(mmtmpD2,mmtmpD3);
#ifdef DEBUG_DLSCH_DEMOD
printf("%%arx%d atx%d rb_index %d symbol %d shift %d\n",aarx,l,rb,symbol,output_shift);
printf("rx_%d(%d,:)",aarx+1,rb+1);
print_shorts(" ",(int16_t *)&rxdataF128[0]);
printf("ch_%d%d(%d,:)",aarx+1,l+1,rb+1);
print_shorts(" ",(int16_t *)&dl_ch128[0]);
printf("rx_comp_%d%d(%d,:)",aarx+1,l+1,rb+1);
print_shorts(" ",(int16_t *)&rxdataF_comp128[0]);
#endif
// multiply by conjugated channel
mmtmpD0 = simde_mm_madd_epi16(dl_ch128[1],rxdataF128[1]);
// mmtmpD0 contains real part of 4 consecutive outputs (32-bit)
mmtmpD1 = simde_mm_shufflelo_epi16(dl_ch128[1],SIMDE_MM_SHUFFLE(2,3,0,1));
mmtmpD1 = simde_mm_shufflehi_epi16(mmtmpD1,SIMDE_MM_SHUFFLE(2,3,0,1));
mmtmpD1 = simde_mm_sign_epi16(mmtmpD1,*(simde__m128i*)conjugate);
mmtmpD1 = simde_mm_madd_epi16(mmtmpD1,rxdataF128[1]);
// mmtmpD1 contains imag part of 4 consecutive outputs (32-bit)
mmtmpD0 = simde_mm_srai_epi32(mmtmpD0,output_shift);
mmtmpD1 = simde_mm_srai_epi32(mmtmpD1,output_shift);
mmtmpD2 = simde_mm_unpacklo_epi32(mmtmpD0,mmtmpD1);
mmtmpD3 = simde_mm_unpackhi_epi32(mmtmpD0,mmtmpD1);
rxdataF_comp128[1] = simde_mm_packs_epi32(mmtmpD2,mmtmpD3);
#ifdef DEBUG_DLSCH_DEMOD
print_shorts("rx:",(int16_t*)&rxdataF128[1]);
print_shorts("ch:",(int16_t*)&dl_ch128[1]);
print_shorts("pack:",(int16_t*)&rxdataF_comp128[1]);
#endif
// multiply by conjugated channel
mmtmpD0 = simde_mm_madd_epi16(dl_ch128[2],rxdataF128[2]);
// mmtmpD0 contains real part of 4 consecutive outputs (32-bit)
mmtmpD1 = simde_mm_shufflelo_epi16(dl_ch128[2],SIMDE_MM_SHUFFLE(2,3,0,1));
mmtmpD1 = simde_mm_shufflehi_epi16(mmtmpD1,SIMDE_MM_SHUFFLE(2,3,0,1));
mmtmpD1 = simde_mm_sign_epi16(mmtmpD1,*(simde__m128i*)conjugate);
mmtmpD1 = simde_mm_madd_epi16(mmtmpD1,rxdataF128[2]);
// mmtmpD1 contains imag part of 4 consecutive outputs (32-bit)
mmtmpD0 = simde_mm_srai_epi32(mmtmpD0,output_shift);
mmtmpD1 = simde_mm_srai_epi32(mmtmpD1,output_shift);
mmtmpD2 = simde_mm_unpacklo_epi32(mmtmpD0,mmtmpD1);
mmtmpD3 = simde_mm_unpackhi_epi32(mmtmpD0,mmtmpD1);
rxdataF_comp128[2] = simde_mm_packs_epi32(mmtmpD2,mmtmpD3);
#ifdef DEBUG_DLSCH_DEMOD
print_shorts("rx:",(int16_t*)&rxdataF128[2]);
print_shorts("ch:",(int16_t*)&dl_ch128[2]);
print_shorts("pack:",(int16_t*)&rxdataF_comp128[2]);
#endif
dl_ch128+=3;
dl_ch_mag128+=3;
dl_ch_mag128b+=3;
dl_ch_mag128r+=3;
rxdataF128+=3;
rxdataF_comp128+=3;
} }
} }
} }
if (rho) {
//we compute the Tx correlation matrix for each Rx antenna
//As an example the 2x2 MIMO case requires
//rho[aarx][nl*nl] = [cov(H_aarx_0,H_aarx_0) cov(H_aarx_0,H_aarx_1)
// cov(H_aarx_1,H_aarx_0) cov(H_aarx_1,H_aarx_1)], aarx=0,...,nb_antennas_rx-1
//int avg_rho_re[frame_parms->nb_antennas_rx][nl*nl]; if (rho) {
//int avg_rho_im[frame_parms->nb_antennas_rx][nl*nl]; // we compute the Tx correlation matrix for each Rx antenna
// As an example the 2x2 MIMO case requires
// rho[aarx][nl*nl] = [cov(H_aarx_0,H_aarx_0) cov(H_aarx_0,H_aarx_1)
// cov(H_aarx_1,H_aarx_0) cov(H_aarx_1,H_aarx_1)], aarx=0,...,nb_antennas_rx-1
for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) { // int avg_rho_re[frame_parms->nb_antennas_rx][nl*nl];
// int avg_rho_im[frame_parms->nb_antennas_rx][nl*nl];
for (int aarx = 0; aarx < frame_parms->nb_antennas_rx; aarx++) {
for (int l = 0; l < n_layers; l++) { for (int l = 0; l < n_layers; l++) {
for (int atx = 0; atx < n_layers; atx++) {
for (atx = 0; atx < n_layers; atx++) { // avg_rho_re[aarx][l*n_layers+atx] = 0;
//avg_rho_re[aarx][l*n_layers+atx] = 0; // avg_rho_im[aarx][l*n_layers+atx] = 0;
//avg_rho_im[aarx][l*n_layers+atx] = 0; rho128 = (simde__m128i *)&rho[aarx][l * n_layers + atx][symbol * nb_rb * 12];
rho128 = (simde__m128i *)&rho[aarx][l * n_layers + atx][symbol * nb_rb * 12]; dl_ch128 = (simde__m128i *)dl_ch_estimates_ext[l * frame_parms->nb_antennas_rx + aarx];
dl_ch128 = (simde__m128i *)dl_ch_estimates_ext[l * frame_parms->nb_antennas_rx + aarx]; dl_ch128_2 = (simde__m128i *)dl_ch_estimates_ext[atx * frame_parms->nb_antennas_rx + aarx];
dl_ch128_2 = (simde__m128i *)dl_ch_estimates_ext[atx * frame_parms->nb_antennas_rx + aarx];
for (int rb = 0; rb < nb_rb_0; rb++) {
for (rb=0; rb<nb_rb_0; rb++) { // multiply by conjugated channel
// multiply by conjugated channel mmtmpD0 = simde_mm_madd_epi16(dl_ch128[0], dl_ch128_2[0]);
mmtmpD0 = simde_mm_madd_epi16(dl_ch128[0],dl_ch128_2[0]); // print_ints("re",&mmtmpD0);
// print_ints("re",&mmtmpD0); // mmtmpD0 contains real part of 4 consecutive outputs (32-bit)
// mmtmpD0 contains real part of 4 consecutive outputs (32-bit) mmtmpD1 = simde_mm_shufflelo_epi16(dl_ch128[0], SIMDE_MM_SHUFFLE(2, 3, 0, 1));
mmtmpD1 = simde_mm_shufflelo_epi16(dl_ch128[0],SIMDE_MM_SHUFFLE(2,3,0,1)); mmtmpD1 = simde_mm_shufflehi_epi16(mmtmpD1, SIMDE_MM_SHUFFLE(2, 3, 0, 1));
mmtmpD1 = simde_mm_shufflehi_epi16(mmtmpD1,SIMDE_MM_SHUFFLE(2,3,0,1)); mmtmpD1 = simde_mm_sign_epi16(mmtmpD1, *(simde__m128i *)&conjugate[0]);
mmtmpD1 = simde_mm_sign_epi16(mmtmpD1,*(simde__m128i*)&conjugate[0]); // print_ints("im",&mmtmpD1);
// print_ints("im",&mmtmpD1); mmtmpD1 = simde_mm_madd_epi16(mmtmpD1, dl_ch128_2[0]);
mmtmpD1 = simde_mm_madd_epi16(mmtmpD1,dl_ch128_2[0]); // mmtmpD1 contains imag part of 4 consecutive outputs (32-bit)
// mmtmpD1 contains imag part of 4 consecutive outputs (32-bit) mmtmpD0 = simde_mm_srai_epi32(mmtmpD0, output_shift);
mmtmpD0 = simde_mm_srai_epi32(mmtmpD0,output_shift); // print_ints("re(shift)",&mmtmpD0);
// print_ints("re(shift)",&mmtmpD0); mmtmpD1 = simde_mm_srai_epi32(mmtmpD1, output_shift);
mmtmpD1 = simde_mm_srai_epi32(mmtmpD1,output_shift); // print_ints("im(shift)",&mmtmpD1);
// print_ints("im(shift)",&mmtmpD1); mmtmpD2 = simde_mm_unpacklo_epi32(mmtmpD0, mmtmpD1);
mmtmpD2 = simde_mm_unpacklo_epi32(mmtmpD0,mmtmpD1); mmtmpD3 = simde_mm_unpackhi_epi32(mmtmpD0, mmtmpD1);
mmtmpD3 = simde_mm_unpackhi_epi32(mmtmpD0,mmtmpD1); // print_ints("c0",&mmtmpD2);
// print_ints("c0",&mmtmpD2); // print_ints("c1",&mmtmpD3);
// print_ints("c1",&mmtmpD3); rho128[0] = simde_mm_packs_epi32(mmtmpD2, mmtmpD3);
rho128[0] = simde_mm_packs_epi32(mmtmpD2,mmtmpD3); // print_shorts("rx:",dl_ch128_2);
//print_shorts("rx:",dl_ch128_2); // print_shorts("ch:",dl_ch128);
//print_shorts("ch:",dl_ch128); // print_shorts("pack:",rho128);
//print_shorts("pack:",rho128);
/*avg_rho_re[aarx][l*n_layers+atx] +=(((int16_t*)&rho128[0])[0]+
/*avg_rho_re[aarx][l*n_layers+atx] +=(((int16_t*)&rho128[0])[0]+ ((int16_t*)&rho128[0])[2] +
((int16_t*)&rho128[0])[2] + ((int16_t*)&rho128[0])[4] +
((int16_t*)&rho128[0])[4] + ((int16_t*)&rho128[0])[6])/16;*/
((int16_t*)&rho128[0])[6])/16;*/ /*avg_rho_im[aarx][l*n_layers+atx] +=(((int16_t*)&rho128[0])[1]+
/*avg_rho_im[aarx][l*n_layers+atx] +=(((int16_t*)&rho128[0])[1]+ ((int16_t*)&rho128[0])[3] +
((int16_t*)&rho128[0])[3] + ((int16_t*)&rho128[0])[5] +
((int16_t*)&rho128[0])[5] + ((int16_t*)&rho128[0])[7])/16;*/
((int16_t*)&rho128[0])[7])/16;*/
// multiply by conjugated channel
// multiply by conjugated channel mmtmpD0 = simde_mm_madd_epi16(dl_ch128[1], dl_ch128_2[1]);
mmtmpD0 = simde_mm_madd_epi16(dl_ch128[1],dl_ch128_2[1]); // mmtmpD0 contains real part of 4 consecutive outputs (32-bit)
// mmtmpD0 contains real part of 4 consecutive outputs (32-bit) mmtmpD1 = simde_mm_shufflelo_epi16(dl_ch128[1], SIMDE_MM_SHUFFLE(2, 3, 0, 1));
mmtmpD1 = simde_mm_shufflelo_epi16(dl_ch128[1],SIMDE_MM_SHUFFLE(2,3,0,1)); mmtmpD1 = simde_mm_shufflehi_epi16(mmtmpD1, SIMDE_MM_SHUFFLE(2, 3, 0, 1));
mmtmpD1 = simde_mm_shufflehi_epi16(mmtmpD1,SIMDE_MM_SHUFFLE(2,3,0,1)); mmtmpD1 = simde_mm_sign_epi16(mmtmpD1, *(simde__m128i *)conjugate);
mmtmpD1 = simde_mm_sign_epi16(mmtmpD1,*(simde__m128i*)conjugate); mmtmpD1 = simde_mm_madd_epi16(mmtmpD1, dl_ch128_2[1]);
mmtmpD1 = simde_mm_madd_epi16(mmtmpD1,dl_ch128_2[1]); // mmtmpD1 contains imag part of 4 consecutive outputs (32-bit)
// mmtmpD1 contains imag part of 4 consecutive outputs (32-bit) mmtmpD0 = simde_mm_srai_epi32(mmtmpD0, output_shift);
mmtmpD0 = simde_mm_srai_epi32(mmtmpD0,output_shift); mmtmpD1 = simde_mm_srai_epi32(mmtmpD1, output_shift);
mmtmpD1 = simde_mm_srai_epi32(mmtmpD1,output_shift); mmtmpD2 = simde_mm_unpacklo_epi32(mmtmpD0, mmtmpD1);
mmtmpD2 = simde_mm_unpacklo_epi32(mmtmpD0,mmtmpD1); mmtmpD3 = simde_mm_unpackhi_epi32(mmtmpD0, mmtmpD1);
mmtmpD3 = simde_mm_unpackhi_epi32(mmtmpD0,mmtmpD1); rho128[1] = simde_mm_packs_epi32(mmtmpD2, mmtmpD3);
rho128[1] =simde_mm_packs_epi32(mmtmpD2,mmtmpD3); // print_shorts("rx:",dl_ch128_2+1);
//print_shorts("rx:",dl_ch128_2+1); // print_shorts("ch:",dl_ch128+1);
//print_shorts("ch:",dl_ch128+1); // print_shorts("pack:",rho128+1);
//print_shorts("pack:",rho128+1);
// multiply by conjugated channel
// multiply by conjugated channel /*avg_rho_re[aarx][l*n_layers+atx] +=(((int16_t*)&rho128[1])[0]+
/*avg_rho_re[aarx][l*n_layers+atx] +=(((int16_t*)&rho128[1])[0]+ ((int16_t*)&rho128[1])[2] +
((int16_t*)&rho128[1])[2] + ((int16_t*)&rho128[1])[4] +
((int16_t*)&rho128[1])[4] + ((int16_t*)&rho128[1])[6])/16;*/
((int16_t*)&rho128[1])[6])/16;*/ /*avg_rho_im[aarx][l*n_layers+atx] +=(((int16_t*)&rho128[1])[1]+
/*avg_rho_im[aarx][l*n_layers+atx] +=(((int16_t*)&rho128[1])[1]+ ((int16_t*)&rho128[1])[3] +
((int16_t*)&rho128[1])[3] + ((int16_t*)&rho128[1])[5] +
((int16_t*)&rho128[1])[5] + ((int16_t*)&rho128[1])[7])/16;*/
((int16_t*)&rho128[1])[7])/16;*/
mmtmpD0 = simde_mm_madd_epi16(dl_ch128[2], dl_ch128_2[2]);
mmtmpD0 = simde_mm_madd_epi16(dl_ch128[2],dl_ch128_2[2]); // mmtmpD0 contains real part of 4 consecutive outputs (32-bit)
// mmtmpD0 contains real part of 4 consecutive outputs (32-bit) mmtmpD1 = simde_mm_shufflelo_epi16(dl_ch128[2], SIMDE_MM_SHUFFLE(2, 3, 0, 1));
mmtmpD1 = simde_mm_shufflelo_epi16(dl_ch128[2],SIMDE_MM_SHUFFLE(2,3,0,1)); mmtmpD1 = simde_mm_shufflehi_epi16(mmtmpD1, SIMDE_MM_SHUFFLE(2, 3, 0, 1));
mmtmpD1 = simde_mm_shufflehi_epi16(mmtmpD1,SIMDE_MM_SHUFFLE(2,3,0,1)); mmtmpD1 = simde_mm_sign_epi16(mmtmpD1, *(simde__m128i *)conjugate);
mmtmpD1 = simde_mm_sign_epi16(mmtmpD1,*(simde__m128i*)conjugate); mmtmpD1 = simde_mm_madd_epi16(mmtmpD1, dl_ch128_2[2]);
mmtmpD1 = simde_mm_madd_epi16(mmtmpD1,dl_ch128_2[2]); // mmtmpD1 contains imag part of 4 consecutive outputs (32-bit)
// mmtmpD1 contains imag part of 4 consecutive outputs (32-bit) mmtmpD0 = simde_mm_srai_epi32(mmtmpD0, output_shift);
mmtmpD0 = simde_mm_srai_epi32(mmtmpD0,output_shift); mmtmpD1 = simde_mm_srai_epi32(mmtmpD1, output_shift);
mmtmpD1 = simde_mm_srai_epi32(mmtmpD1,output_shift); mmtmpD2 = simde_mm_unpacklo_epi32(mmtmpD0, mmtmpD1);
mmtmpD2 = simde_mm_unpacklo_epi32(mmtmpD0,mmtmpD1); mmtmpD3 = simde_mm_unpackhi_epi32(mmtmpD0, mmtmpD1);
mmtmpD3 = simde_mm_unpackhi_epi32(mmtmpD0,mmtmpD1);
rho128[2] = simde_mm_packs_epi32(mmtmpD2, mmtmpD3);
rho128[2] = simde_mm_packs_epi32(mmtmpD2,mmtmpD3); // print_shorts("rx:",dl_ch128_2+2);
//print_shorts("rx:",dl_ch128_2+2); // print_shorts("ch:",dl_ch128+2);
//print_shorts("ch:",dl_ch128+2); // print_shorts("pack:",rho128+2);
//print_shorts("pack:",rho128+2);
/*avg_rho_re[aarx][l*n_layers+atx] +=(((int16_t*)&rho128[2])[0]+
/*avg_rho_re[aarx][l*n_layers+atx] +=(((int16_t*)&rho128[2])[0]+ ((int16_t*)&rho128[2])[2] +
((int16_t*)&rho128[2])[2] + ((int16_t*)&rho128[2])[4] +
((int16_t*)&rho128[2])[4] + ((int16_t*)&rho128[2])[6])/16;*/
((int16_t*)&rho128[2])[6])/16;*/ /*avg_rho_im[aarx][l*n_layers+atx] +=(((int16_t*)&rho128[2])[1]+
/*avg_rho_im[aarx][l*n_layers+atx] +=(((int16_t*)&rho128[2])[1]+ ((int16_t*)&rho128[2])[3] +
((int16_t*)&rho128[2])[3] + ((int16_t*)&rho128[2])[5] +
((int16_t*)&rho128[2])[5] + ((int16_t*)&rho128[2])[7])/16;*/
((int16_t*)&rho128[2])[7])/16;*/
dl_ch128+=3; dl_ch128+=3;
dl_ch128_2+=3; dl_ch128_2+=3;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment