Commit ce7bdf71 authored by Laurent THOMAS's avatar Laurent THOMAS

fix simde lacking refix

parent 34970d38
......@@ -22,6 +22,9 @@
#include "nr_modulation.h"
#include "PHY/NR_REFSIG/nr_mod_table.h"
#include "executables/softmodem-common.h"
#include <simde/x86/avx512.h>
// Lacking declaration in present simde external package, will be detected as compilation error when they will add it
#define simde_mm512_extracti64x2_epi64(a...) _mm512_extracti64x2_epi64(a)
// #define DEBUG_DLSCH_PRECODING_PRINT_WITH_TRIVIAL // TODO: For debug, to be removed if want to merge to develop
// #define DEBUG_LAYER_MAPPING
......@@ -263,13 +266,13 @@ void nr_layer_mapping(int nbCodes,
c16_t *tx0 = tx_layers[0];
c16_t *tx1 = tx_layers[1];
#if defined(__AVX512BW__)
__m512i perm2a = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
__m512i perm2b = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
simde__m512i perm2a = simde_mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
simde__m512i perm2b = simde_mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
for (; i < (n_symbs & ~31); i += 32) {
__m512i a = *(__m512i *)(mod + i);
__m512i b = *(__m512i *)(mod + i + 16);
*(__m512i *)tx0 = _mm512_permutex2var_epi32(a, perm2a, b);
*(__m512i *)tx1 = _mm512_permutex2var_epi32(a, perm2b, b);
simde__m512i a = *(simde__m512i *)(mod + i);
simde__m512i b = *(simde__m512i *)(mod + i + 16);
*(simde__m512i *)tx0 = simde_mm512_permutex2var_epi32(a, perm2a, b);
*(simde__m512i *)tx1 = simde_mm512_permutex2var_epi32(a, perm2b, b);
tx0 += 16;
tx1 += 16;
}
......@@ -308,55 +311,69 @@ void nr_layer_mapping(int nbCodes,
c16_t *tx1 = tx_layers[1];
c16_t *tx2 = tx_layers[2];
#if defined(__AVX512BW__)
__m512i perm3_0 =
_mm512_set_epi32(13 + 16, 10 + 16, 7 + 16, 4 + 16, 1 + 16, 14 + 16, 11 + 16, 8 + 16, 5 + 16, 2 + 16, 15, 12, 9, 6, 3, 0);
__m512i perm3_0b = _mm512_set_epi32(13 + 16, 10 + 16, 7 + 16, 4 + 16, 1 + 16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__m512i perm3_1 = _mm512_set_epi32(14 + 16,
11 + 16,
8 + 16,
5 + 16,
2 + 16,
15 + 16,
12 + 16,
9 + 16,
6 + 16,
3 + 16,
0 + 16,
13,
10,
7,
4,
1);
__m512i perm3_1b = _mm512_set_epi32(14 + 16, 11 + 16, 8 + 16, 5 + 16, 2 + 16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__m512i perm3_2 = _mm512_set_epi32(15 + 16,
12 + 16,
9 + 16,
6 + 16,
3 + 16,
0 + 16,
13 + 16,
10 + 16,
7 + 16,
4 + 16,
1 + 16,
14,
11,
8,
5,
2);
__m512i perm3_2b = _mm512_set_epi32(15 + 16, 12 + 16, 9 + 16, 6 + 16, 3 + 16, 0 + 16, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
simde__m512i perm3_0 = simde_mm512_set_epi32(13 + 16,
10 + 16,
7 + 16,
4 + 16,
1 + 16,
14 + 16,
11 + 16,
8 + 16,
5 + 16,
2 + 16,
15,
12,
9,
6,
3,
0);
simde__m512i perm3_0b = simde_mm512_set_epi32(13 + 16, 10 + 16, 7 + 16, 4 + 16, 1 + 16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
simde__m512i perm3_1 = simde_mm512_set_epi32(14 + 16,
11 + 16,
8 + 16,
5 + 16,
2 + 16,
15 + 16,
12 + 16,
9 + 16,
6 + 16,
3 + 16,
0 + 16,
13,
10,
7,
4,
1);
simde__m512i perm3_1b = simde_mm512_set_epi32(14 + 16, 11 + 16, 8 + 16, 5 + 16, 2 + 16, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
simde__m512i perm3_2 = simde_mm512_set_epi32(15 + 16,
12 + 16,
9 + 16,
6 + 16,
3 + 16,
0 + 16,
13 + 16,
10 + 16,
7 + 16,
4 + 16,
1 + 16,
14,
11,
8,
5,
2);
simde__m512i perm3_2b = simde_mm512_set_epi32(15 + 16, 12 + 16, 9 + 16, 6 + 16, 3 + 16, 0 + 16, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
for (; i < (n_symbs & ~63); i += 48) {
__m512i i0 = *(__m512i *)(mod + i);
__m512i i1 = *(__m512i *)(mod + i + 16);
__m512i i2 = *(__m512i *)(mod + i + 32);
__m512i d0 = _mm512_permutex2var_epi32(i0, perm3_0, i1);
*(__m512i *)tx0 = _mm512_permutex2var_epi32(d0, perm3_0b, i2); // 11000000
simde__m512i i0 = *(simde__m512i *)(mod + i);
simde__m512i i1 = *(simde__m512i *)(mod + i + 16);
simde__m512i i2 = *(simde__m512i *)(mod + i + 32);
simde__m512i d0 = simde_mm512_permutex2var_epi32(i0, perm3_0, i1);
*(simde__m512i *)tx0 = simde_mm512_permutex2var_epi32(d0, perm3_0b, i2); // 11000000
tx0 += 16;
d0 = _mm512_permutex2var_epi32(i0, perm3_1, i1);
*(__m512i *)tx1 = _mm512_permutex2var_epi32(d0, perm3_1b, i2); // 11000000
d0 = simde_mm512_permutex2var_epi32(i0, perm3_1, i1);
*(simde__m512i *)tx1 = simde_mm512_permutex2var_epi32(d0, perm3_1b, i2); // 11000000
tx1 += 16;
d0 = _mm512_permutex2var_epi32(i0, perm3_2, i1);
*(__m512i *)tx2 = _mm512_permutex2var_epi32(d0, perm3_2b, i2); // 11000000
d0 = simde_mm512_permutex2var_epi32(i0, perm3_2, i1);
*(simde__m512i *)tx2 = simde_mm512_permutex2var_epi32(d0, perm3_2b, i2); // 11000000
tx2 += 16;
}
#endif
......@@ -432,16 +449,16 @@ void nr_layer_mapping(int nbCodes,
c16_t *tx2 = tx_layers[2];
c16_t *tx3 = tx_layers[3];
#if defined(__AVX512BW__)
__m512i perm4 = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
simde__m512i perm4 = simde_mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
for (; i < (n_symbs & ~15); i += 16) {
__m512i e = _mm512_permutexvar_epi32(perm4, *(__m512i *)(mod + i));
*(simde__m128i *)tx0 = _mm512_extracti64x2_epi64(e, 0);
simde__m512i e = simde_mm512_permutexvar_epi32(perm4, *(simde__m512i *)(mod + i));
*(simde__m128i *)tx0 = simde_mm512_extracti64x2_epi64(e, 0);
tx0 += 4;
*(simde__m128i *)tx1 = _mm512_extracti64x2_epi64(e, 1);
*(simde__m128i *)tx1 = simde_mm512_extracti64x2_epi64(e, 1);
tx1 += 4;
*(simde__m128i *)tx2 = _mm512_extracti64x2_epi64(e, 2);
*(simde__m128i *)tx2 = simde_mm512_extracti64x2_epi64(e, 2);
tx2 += 4;
*(simde__m128i *)tx3 = _mm512_extracti64x2_epi64(e, 3);
*(simde__m128i *)tx3 = simde_mm512_extracti64x2_epi64(e, 3);
tx3 += 4;
}
#endif
......
......@@ -43,6 +43,7 @@
// #define DEBUG_DLSCH
// #define DEBUG_DLSCH_MAPPING
#include <simde/x86/avx512.h>
#define USE128BIT
static void nr_pdsch_codeword_scrambling(uint8_t *in, uint32_t size, uint8_t q, uint32_t Nid, uint32_t n_RNTI, uint32_t *out)
......@@ -106,14 +107,14 @@ static inline int dmrs_case2a(c16_t *txF, c16_t *mod_dmrs, const int amp_dmrs, i
int i = 0;
int end = sz / 2;
#if defined(__AVX512BW__)
__m512i zeros512 = _mm512_setzero_si512(), amp_dmrs512 = _mm512_set1_epi16(amp_dmrs);
__m512i perml = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i permh = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
simde__m512i zeros512 = simde_mm512_setzero_si512(), amp_dmrs512 = simde_mm512_set1_epi16(amp_dmrs);
simde__m512i perml = simde_mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
simde__m512i permh = simde_mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (; i < (end & ~15); i += 16) {
__m512i d0 = _mm512_mulhrs_epi16(_mm512_loadu_si512((__m512i *)(mod_dmrs + i)), amp_dmrs512);
_mm512_storeu_si512((__m512i *)out, _mm512_permutex2var_epi32(d0, perml, zeros512));
simde__m512i d0 = simde_mm512_mulhrs_epi16(_mm512_loadu_si512((simde__m512i *)(mod_dmrs + i)), amp_dmrs512);
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(d0, perml, zeros512));
out += 16;
_mm512_storeu_si512((__m512i *)out, _mm512_permutex2var_epi32(d0, permh, zeros512));
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(d0, permh, zeros512));
out += 16;
}
#endif
......@@ -157,14 +158,14 @@ static inline int dmrs_case2b(c16_t *txF, c16_t *mod_dmrs, const int amp_dmrs, i
int i = 0;
int end = sz / 2;
#if defined(__AVX512BW__)
__m512i zeros512 = _mm512_setzero_si512(), amp_dmrs512 = _mm512_set1_epi16(amp_dmrs);
__m512i perml = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i permh = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
simde__m512i zeros512 = simde_mm512_setzero_si512(), amp_dmrs512 = simde_mm512_set1_epi16(amp_dmrs);
simde__m512i perml = simde_mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
simde__m512i permh = simde_mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (; i < (end & ~15); i += 16) {
__m512i d0 = _mm512_mulhrs_epi16(_mm512_loadu_si512((__m512i *)(mod_dmrs + i)), amp_dmrs512);
_mm512_storeu_si512((__m512i *)out, _mm512_permutex2var_epi32(zeros512, perml, d0));
simde__m512i d0 = simde_mm512_mulhrs_epi16(_mm512_loadu_si512((simde__m512i *)(mod_dmrs + i)), amp_dmrs512);
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(zeros512, perml, d0));
out += 16;
_mm512_storeu_si512((__m512i *)out, _mm512_permutex2var_epi32(zeros512, permh, d0));
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(zeros512, permh, d0));
out += 16;
}
#endif
......@@ -209,16 +210,16 @@ static inline int dmrs_case1a(c16_t *txF, c16_t *txl, c16_t *mod_dmrs, const int
int i = 0;
int end = sz / 2;
#if defined(__AVX512BW__)
__m512i amp_dmrs512 = _mm512_set1_epi16(amp_dmrs), amp512 = _mm512_set1_epi16(amp);
__m512i perml = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i permh = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
simde__m512i amp_dmrs512 = simde_mm512_set1_epi16(amp_dmrs), amp512 = simde_mm512_set1_epi16(amp);
simde__m512i perml = simde_mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
simde__m512i permh = simde_mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (; i < (end & ~15); i += 16) {
__m512i d0 = _mm512_mulhrs_epi16(_mm512_loadu_si512((__m512i *)(mod_dmrs + i)), amp_dmrs512);
__m512i d1 = _mm512_mulhrs_epi16(_mm512_loadu_si512((__m512i *)txl), amp512);
simde__m512i d0 = simde_mm512_mulhrs_epi16(_mm512_loadu_si512((simde__m512i *)(mod_dmrs + i)), amp_dmrs512);
simde__m512i d1 = simde_mm512_mulhrs_epi16(_mm512_loadu_si512((simde__m512i *)txl), amp512);
txl += 16;
_mm512_storeu_si512((__m512i *)out, _mm512_permutex2var_epi32(d0, perml, d1));
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(d0, perml, d1));
out += 16;
_mm512_storeu_si512((__m512i *)out, _mm512_permutex2var_epi32(d0, permh, d1));
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(d0, permh, d1));
out += 16;
}
#endif
......@@ -266,16 +267,16 @@ static inline int dmrs_case1b(c16_t *txF, c16_t *txl, c16_t *mod_dmrs, const int
int i = 0;
int end = sz / 2;
#if defined(__AVX512BW__)
__m512i amp_dmrs512 = _mm512_set1_epi16(amp_dmrs), amp512 = _mm512_set1_epi16(amp);
__m512i perml = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
__m512i permh = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
simde__m512i amp_dmrs512 = simde_mm512_set1_epi16(amp_dmrs), amp512 = simde_mm512_set1_epi16(amp);
simde__m512i perml = simde_mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
simde__m512i permh = simde_mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8);
for (; i < (end & ~15); i += 16) {
__m512i d0 = _mm512_mulhrs_epi16(_mm512_loadu_si512((__m512i *)(mod_dmrs + i)), amp_dmrs512);
__m512i d1 = _mm512_mulhrs_epi16(_mm512_loadu_si512((__m512i *)txl), amp512);
simde__m512i d0 = simde_mm512_mulhrs_epi16(_mm512_loadu_si512((simde__m512i *)(mod_dmrs + i)), amp_dmrs512);
simde__m512i d1 = simde_mm512_mulhrs_epi16(_mm512_loadu_si512((simde__m512i *)txl), amp512);
txl += 16;
_mm512_storeu_si512((__m512i *)out, _mm512_permutex2var_epi32(d1, perml, d0));
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(d1, perml, d0));
out += 16;
_mm512_storeu_si512((__m512i *)out, _mm512_permutex2var_epi32(d1, permh, d0));
simde_mm512_storeu_si512((simde__m512i *)out, simde_mm512_permutex2var_epi32(d1, permh, d0));
out += 16;
}
#endif
......@@ -364,10 +365,10 @@ static inline int no_ptrs_dmrs_case(c16_t *txF, c16_t *txl, const int amp, const
// Loop Over SCs:
int i = 0;
#if defined(__AVX512BW__)
__m512i amp512 = _mm512_set1_epi16(amp);
simde__m512i amp512 = simde_mm512_set1_epi16(amp);
for (; i < (sz & ~15); i += 16) {
const __m512i txL = _mm512_loadu_si512((__m512i *)(txl + i));
_mm512_storeu_si512((__m512i *)(txF + i), _mm512_mulhrs_epi16(amp512, txL));
const simde__m512i txL = simde_mm512_loadu_si512((simde__m512i *)(txl + i));
simde_mm512_storeu_si512((simde__m512i *)(txF + i), simde_mm512_mulhrs_epi16(amp512, txL));
}
#endif
#if defined(__AVX2__)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment