Commit 41a262c2 authored by yilmazt

AVX512F intrinsics added for shortBlockCoding

parent d7e537fa
...@@ -362,7 +362,7 @@ void applyFtoleft(t_nrPolar_params *pp, decoder_node_t *node) { ...@@ -362,7 +362,7 @@ void applyFtoleft(t_nrPolar_params *pp, decoder_node_t *node) {
else else
#endif #endif
{ // equvalent scalar code to above, activated only on non x86/ARM architectures { // equivalent scalar code to above, activated only on non x86/ARM architectures
for (int i=0;i<node->Nv/2;i++) { for (int i=0;i<node->Nv/2;i++) {
a=alpha_v[i]; a=alpha_v[i];
b=alpha_v[i+(node->Nv/2)]; b=alpha_v[i+(node->Nv/2)];
...@@ -442,7 +442,7 @@ void applyGtoright(t_nrPolar_params *pp,decoder_node_t *node) { ...@@ -442,7 +442,7 @@ void applyGtoright(t_nrPolar_params *pp,decoder_node_t *node) {
} }
else else
#endif #endif
{// equvalent scalar code to above, activated only on non x86/ARM architectures or Nv=1,2 {// equivalent scalar code to above, activated only on non x86/ARM architectures or Nv=1,2
for (int i=0;i<node->Nv/2;i++) { for (int i=0;i<node->Nv/2;i++) {
alpha_r[i] = alpha_v[i+(node->Nv/2)] - (betal[i]*alpha_v[i]); alpha_r[i] = alpha_v[i+(node->Nv/2)] - (betal[i]*alpha_v[i]);
} }
......
...@@ -36,6 +36,9 @@ ...@@ -36,6 +36,9 @@
//#define DEBUG_DECODESMALLBLOCK //#define DEBUG_DECODESMALLBLOCK
//input = [d̂_0] [d̂_1] [d̂_2] ... [d̂_31]
//output = [? ... ? ĉ_K-1 ... ĉ_2 ĉ_1 ĉ_0]
uint16_t decodeSmallBlock(int8_t *in, uint8_t len){ uint16_t decodeSmallBlock(int8_t *in, uint8_t len){
uint16_t out = 0; uint16_t out = 0;
...@@ -80,23 +83,58 @@ uint16_t decodeSmallBlock(int8_t *in, uint8_t len){ ...@@ -80,23 +83,58 @@ uint16_t decodeSmallBlock(int8_t *in, uint8_t len){
#endif #endif
} else { } else {
uint8_t maxRow = 0, maxCol = 0;
#if defined(__AVX2__)
int16_t maxVal = 0;
int DmatrixElementVal = 0;
int8_t DmatrixElement[NR_SMALL_BLOCK_CODED_BITS] = {0};
__m256i _in_256 = _mm256_loadu_si256 ((__m256i*)&in[0]);
__m256i _maskD_256, _Dmatrixj_256, _maskH_256, _DmatrixElement_256;
for (int j = 0; j < ( 1<<(len-6) ); ++j) {
_maskD_256 = _mm256_loadu_si256 ((__m256i*)(&maskD[j][0]));
_Dmatrixj_256 = _mm256_sign_epi8 (_in_256, _maskD_256);
for (int k = 0; k < NR_SMALL_BLOCK_CODED_BITS; ++k) {
_maskH_256 = _mm256_loadu_si256 ((__m256i*)(&hadamard32InterleavedTransposed[k][0]));
_DmatrixElement_256 = _mm256_sign_epi8 (_Dmatrixj_256, _maskH_256);
#if defined(__AVX512F__)
DmatrixElementVal = _mm512_reduce_add_epi32 (
_mm512_add_epi32(
_mm512_cvtepi8_epi32 (_mm256_extracti128_si256 (_DmatrixElement_256, 1)),
_mm512_cvtepi8_epi32 (_mm256_castsi256_si128 (_DmatrixElement_256))
)
);
#else
_mm256_storeu_si256((__m256i*)(&DmatrixElement[0]), _DmatrixElement_256);
for (int i = 0; i < NR_SMALL_BLOCK_CODED_BITS; ++i)
DmatrixElementVal += DmatrixElement[i];
#endif
if (abs(DmatrixElementVal) > abs(maxVal)){
maxVal = DmatrixElementVal;
maxRow = j;
maxCol = k;
}
DmatrixElementVal=0;
}
}
out = properOrderedBasisExtended[maxRow] | properOrderedBasis[maxCol] | ( (maxVal > 0) ? (uint16_t)0 : (uint16_t)1 );
#else
int8_t Dmatrix[NR_SMALL_BLOCK_CODED_BITS][NR_SMALL_BLOCK_CODED_BITS] = {0}; int8_t Dmatrix[NR_SMALL_BLOCK_CODED_BITS][NR_SMALL_BLOCK_CODED_BITS] = {0};
int16_t DmatrixFHT[NR_SMALL_BLOCK_CODED_BITS][NR_SMALL_BLOCK_CODED_BITS] = {0}; int16_t DmatrixFHT[NR_SMALL_BLOCK_CODED_BITS][NR_SMALL_BLOCK_CODED_BITS] = {0};
uint16_t maxVal; uint16_t maxVal;
uint8_t maxRow = 0, maxCol = 0; uint8_t rowLimit = 1<<(len-6);
for (int j = 0; j < NR_SMALL_BLOCK_CODED_BITS; ++j) for (int j = 0; j < ( rowLimit ); ++j)
for (int k = 0; k < NR_SMALL_BLOCK_CODED_BITS; ++k) for (int k = 0; k < NR_SMALL_BLOCK_CODED_BITS; ++k)
Dmatrix[j][k] = in[k] * maskD[j][k]; Dmatrix[j][k] = in[k] * maskD[j][k];
for (int i = 0; i < ( rowLimit ); ++i)
for (int i = 0; i < NR_SMALL_BLOCK_CODED_BITS; ++i)
for (int j = 0; j < NR_SMALL_BLOCK_CODED_BITS; ++j) for (int j = 0; j < NR_SMALL_BLOCK_CODED_BITS; ++j)
for (int k = 0; k < NR_SMALL_BLOCK_CODED_BITS; ++k) for (int k = 0; k < NR_SMALL_BLOCK_CODED_BITS; ++k)
DmatrixFHT[i][j] += Dmatrix[i][k] * hadamard32InterleavedTransposed[j][k]; DmatrixFHT[i][j] += Dmatrix[i][k] * hadamard32InterleavedTransposed[j][k];
maxVal = abs(DmatrixFHT[0][0]); maxVal = abs(DmatrixFHT[0][0]);
for (int i = 0; i < NR_SMALL_BLOCK_CODED_BITS; ++i) for (int i = 0; i < ( rowLimit ); ++i)
for (int j = 0; j < NR_SMALL_BLOCK_CODED_BITS; ++j) for (int j = 0; j < NR_SMALL_BLOCK_CODED_BITS; ++j)
if (abs(DmatrixFHT[i][j]) > maxVal){ if (abs(DmatrixFHT[i][j]) > maxVal){
maxVal = abs(DmatrixFHT[i][j]); maxVal = abs(DmatrixFHT[i][j]);
...@@ -105,6 +143,7 @@ uint16_t decodeSmallBlock(int8_t *in, uint8_t len){ ...@@ -105,6 +143,7 @@ uint16_t decodeSmallBlock(int8_t *in, uint8_t len){
} }
out = properOrderedBasisExtended[maxRow] | properOrderedBasis[maxCol] | ( (DmatrixFHT[maxRow][maxCol] > 0) ? (uint16_t)0 : (uint16_t)1 ); out = properOrderedBasisExtended[maxRow] | properOrderedBasis[maxCol] | ( (DmatrixFHT[maxRow][maxCol] > 0) ? (uint16_t)0 : (uint16_t)1 );
#endif
#ifdef DEBUG_DECODESMALLBLOCK #ifdef DEBUG_DECODESMALLBLOCK
for (int k = 0; k < NR_SMALL_BLOCK_CODED_BITS; ++k) for (int k = 0; k < NR_SMALL_BLOCK_CODED_BITS; ++k)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment